445 files changed, 13767 insertions, 4651 deletions
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 000000000..cf782a580
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,21 @@
+name: "Build"
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  default:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/cache@v1
+      with:
+        path: ~/.cache/bazel
+        key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }}
+        restore-keys: |
+          ${{ runner.os }}-bazel-
+    - run: make
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
new file mode 100644
index 000000000..10c86f5cd
--- /dev/null
+++ b/.github/workflows/go.yml
@@ -0,0 +1,66 @@
+name: "Go"
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  generate:
+    runs-on: ubuntu-latest
+    steps:
+    - run: |
+        jq -nc '{"state": "pending", "context": "go tests"}' | \
+        curl -sL -X POST -d @- \
+            -H "Content-Type: application/json" \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            "${{ github.event.pull_request.statuses_url }}"
+      if: github.event_name == 'pull_request'
+    - uses: actions/checkout@v2
+      if: github.event_name == 'push'
+      with:
+        fetch-depth: 0
+        token: '${{ secrets.GO_TOKEN }}'
+    - uses: actions/checkout@v2
+      if: github.event_name == 'pull_request'
+      with:
+        fetch-depth: 0
+    - uses: actions/setup-go@v2
+      with:
+        go-version: 1.14
+    - uses: actions/cache@v1
+      with:
+        path: ~/go/pkg/mod
+        key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
+        restore-keys: |
+          ${{ runner.os }}-go-
+    - uses: actions/cache@v1
+      with:
+        path: ~/.cache/bazel
+        key: ${{ runner.os }}-bazel-${{ hashFiles('WORKSPACE') }}
+        restore-keys: |
+          ${{ runner.os }}-bazel-
+    - run: make build TARGETS="//:gopath"
+    - run: tools/go_branch.sh
+    - run: git checkout go && git clean -f
+    - run: go build ./...
+    - if: github.event_name == 'push'
+      run: |
+        git remote add upstream "https://github.com/${{ github.repository }}"
+        git push upstream go:go
+    - if: ${{ success() && github.event_name == 'pull_request' }}
+      run: |
+        jq -nc '{"state": "success", "context": "go tests"}' | \
+        curl -sL  -X POST -d @- \
+            -H "Content-Type: application/json" \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            "${{ github.event.pull_request.statuses_url }}"
+    - if: ${{ failure() && github.event_name == 'pull_request' }}
+      run: |
+        jq -nc '{"state": "failure", "context": "go tests"}' | \
+        curl -sL  -X POST -d @- \
+            -H "Content-Type: application/json" \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            "${{ github.event.pull_request.statuses_url }}"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
new file mode 100644
index 000000000..0b31fecf5
--- /dev/null
+++ b/.github/workflows/stale.yml
@@ -0,0 +1,20 @@
+name: "Close stale issues"
+on:
+  schedule:
+  - cron: "0 0 * * *"
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/stale@v3
+      with:
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+        stale-issue-label: 'stale'
+        stale-pr-label: 'stale'
+        exempt-issue-labels: 'exported, type: bug, type: cleanup, type: enhancement, type: process, type: proposal, type: question'
+        exempt-pr-labels: 'ready to pull'
+        stale-issue-message: 'This issue is stale because it has been open 90 days with no activity. Remove the stale label or comment or this will be closed in 30 days.'
+        stale-pr-message: 'This pull request is stale because it has been open 90 days with no activity. Remove the stale label or comment or this will be closed in 30 days.'
+        days-before-stale: 90
+        days-before-close: 30
diff --git a/.travis.yml b/.travis.yml
index 40c8773fa..4cb202b7d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,7 +13,10 @@ jobs:
    - os: linux
      arch: arm64
 script:
-   - uname -a && make smoke-test
+   # On arm64, we need to create our own pipes for stderr and stdout,
+   # otherwise we will not be able to open /dev/stderr. This is probably
+   # due to AppArmor rules.
+   - bash -xeo pipefail -c 'uname -a && make smoke-test 2>&1 | cat'
 branches:
   except:
   # Skip copybara branches.
diff --git a/Makefile b/Makefile
index 7f382695d..85818ebea 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@
 
 # Described below.
 OPTIONS :=
+STARTUP_OPTIONS :=
 TARGETS := //runsc
 ARGS    :=
 
@@ -24,7 +25,7 @@ default: runsc
 
 ## usage: make <target>
 ##         or
-##        make <build|test|copy|run|sudo> OPTIONS="..." TARGETS="..." ARGS="..."
+##        make <build|test|copy|run|sudo> STARTUP_OPTIONS="..." OPTIONS="..." TARGETS="..." ARGS="..."
 ##
 ## Basic targets.
 ##
@@ -33,6 +34,7 @@ default: runsc
 ##   requirements.
 ##
 ##   There are common arguments that may be passed to targets. These are:
+##     STARTUP_OPTIONS - Bazel startup options.
 ##     OPTIONS - Build or test options.
 ##     TARGETS - The bazel targets.
 ##     ARGS    - Arguments for run or sudo.
@@ -116,7 +118,7 @@ unit-tests: ## Runs all unit tests in pkg runsc and tools.
 .PHONY: unit-tests
 
 tests: ## Runs all local ptrace system call tests.
-	@$(MAKE) test OPTIONS="--test_tag_filter runsc_ptrace test/syscalls/..."
+	@$(MAKE) test OPTIONS="--test_tag_filters runsc_ptrace test/syscalls/..."
 .PHONY: tests
 
 ##
@@ -152,6 +154,52 @@ website-deploy: website-push ## Deploy a new version of the website.
 .PHONY: website-push
 
 ##
+## Repository builders.
+##
+##   This builds a local apt repository. The following variables may be set:
+##     RELEASE_ROOT    - The repository root (default: "repo" directory).
+##     RELEASE_KEY     - The repository GPG private key file (default: dummy key is created).
+##     RELEASE_NIGHTLY - Set to true if a nightly release (default: false).
+##     RELEASE_COMMIT  - The commit or Change-Id for the release (needed for tag).
+##     RELEASE_NAME    - The name of the release in the proper format (needed for tag).
+##     RELEASE_NOTES   - The file containing release notes (needed for tag).
+##
+RELEASE_ROOT    := $(CURDIR)/repo
+RELEASE_KEY     := repo.key
+RELEASE_NIGHTLY := false
+RELEASE_COMMIT  :=
+RELEASE_NAME    :=
+RELEASE_NOTES   :=
+
+GPG_TEST_OPTIONS := $(shell if gpg --pinentry-mode loopback --version >/dev/null 2>&1; then echo --pinentry-mode loopback; fi)
+$(RELEASE_KEY):
+	@echo "WARNING: Generating a key for testing ($@); don't use this."
+	T=$$(mktemp /tmp/keyring.XXXXXX); \
+	C=$$(mktemp /tmp/config.XXXXXX); \
+	echo Key-Type: DSA >> $$C && \
+	echo Key-Length: 1024 >> $$C && \
+	echo Name-Real: Test >> $$C && \
+	echo Name-Email: test@example.com >> $$C && \
+	echo Expire-Date: 0 >> $$C && \
+	echo %commit >> $$C && \
+	gpg --batch $(GPG_TEST_OPTIONS) --passphrase '' --no-default-keyring --keyring $$T --no-tty --gen-key $$C && \
+	gpg --batch $(GPG_TEST_OPTIONS) --export-secret-keys --no-default-keyring --keyring $$T --secret-keyring $$T > $@; \
+	rc=$$?; rm -f $$T $$C; exit $$rc
+
+release: $(RELEASE_KEY) ## Builds a release.
+	@mkdir -p $(RELEASE_ROOT)
+	@T=$$(mktemp -d /tmp/release.XXXXXX); \
+	  $(MAKE) copy TARGETS="runsc" DESTINATION=$$T && \
+	  $(MAKE) copy TARGETS="runsc:runsc-debian" DESTINATION=$$T && \
+	  NIGHTLY=$(RELEASE_NIGHTLY) tools/make_release.sh $(RELEASE_KEY) $(RELEASE_ROOT) $$T/*; \
+	rc=$$?; rm -rf $$T; exit $$rc
+.PHONY: release
+
+tag: ## Creates and pushes a release tag.
+	@tools/tag_release.sh "$(RELEASE_COMMIT)" "$(RELEASE_NAME)" "$(RELEASE_NOTES)"
+.PHONY: tag
+
+##
 ## Development helpers and tooling.
 ##
 ##   These targets faciliate local development by automatically
@@ -179,6 +227,7 @@ dev: ## Installs a set of local runtimes. Requires sudo.
 	@$(MAKE) configure RUNTIME="$(RUNTIME)" ARGS="--net-raw"
 	@$(MAKE) configure RUNTIME="$(RUNTIME)-d" ARGS="--net-raw --debug --strace --log-packets"
 	@$(MAKE) configure RUNTIME="$(RUNTIME)-p" ARGS="--net-raw --profile"
+	@$(MAKE) configure RUNTIME="$(RUNTIME)-vfs2-d" ARGS="--net-raw --debug --strace --log-packets --vfs2"
 	@sudo systemctl restart docker
 .PHONY: dev
 
diff --git a/README.md b/README.md
index de3e06f4e..d72d1dac4 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 ![gVisor](g3doc/logo.png)
 
-[![Status](https://storage.googleapis.com/gvisor-build-badges/build.svg)](https://storage.googleapis.com/gvisor-build-badges/build.html)
+![](https://github.com/google/gvisor/workflows/Build/badge.svg)
 [![gVisor chat](https://badges.gitter.im/gvisor/community.png)](https://gitter.im/gvisor/community)
 
 ## What is gVisor?
 
-**gVisor** is a user-space kernel, written in Go, that implements a substantial
-portion of the Linux system surface. It includes an
+**gVisor** is an application kernel, written in Go, that implements a
+substantial portion of the Linux system surface. It includes an
 [Open Container Initiative (OCI)][oci] runtime called `runsc` that provides an
 isolation boundary between the application and the host kernel. The `runsc`
 runtime integrates with Docker and Kubernetes, making it simple to run sandboxed
@@ -15,16 +15,17 @@ containers.
 ## Why does gVisor exist?
 
 Containers are not a [**sandbox**][sandbox]. While containers have
-revolutionized how we develop, package, and deploy applications, running
-untrusted or potentially malicious code without additional isolation is not a
-good idea. The efficiency and performance gains from using a single, shared
-kernel also mean that container escape is possible with a single vulnerability.
-
-gVisor is a user-space kernel for containers. It limits the host kernel surface
-accessible to the application while still giving the application access to all
-the features it expects. Unlike most kernels, gVisor does not assume or require
-a fixed set of physical resources; instead, it leverages existing host kernel
-functionality and runs as a normal user-space process. In other words, gVisor
+revolutionized how we develop, package, and deploy applications, using them to
+run untrusted or potentially malicious code without additional isolation is not
+a good idea. While using a single, shared kernel allows for efficiency and
+performance gains, it also means that container escape is possible with a single
+vulnerability.
+
+gVisor is an application kernel for containers. It limits the host kernel
+surface accessible to the application while still giving the application access
+to all the features it expects. Unlike most kernels, gVisor does not assume or
+require a fixed set of physical resources; instead, it leverages existing host
+kernel functionality and runs as a normal process. In other words, gVisor
 implements Linux by way of Linux.
 
 gVisor should not be confused with technologies and tools to harden containers
@@ -39,75 +40,44 @@ be found at [gvisor.dev][gvisor-dev].
 
 ## Installing from source
 
-gVisor currently requires x86\_64 Linux to build, though support for other
-architectures may become available in the future.
+gVisor builds on x86_64 and ARM64. Other architectures may become available in
+the future.
+
+For the purposes of these instructions, [bazel][bazel] and other build
+dependencies are wrapped in a build container. It is possible to use
+[bazel][bazel] directly, or type `make help` for standard targets.
 
 ### Requirements
 
 Make sure the following dependencies are installed:
 
 *   Linux 4.14.77+ ([older linux][old-linux])
-*   [git][git]
-*   [Bazel][bazel] 1.2+
-*   [Python][python]
 *   [Docker version 17.09.0 or greater][docker]
-*   C++ toolchain supporting C++17 (GCC 7+, Clang 5+)
-*   Gold linker (e.g. `binutils-gold` package on Ubuntu)
 
 ### Building
 
 Build and install the `runsc` binary:
 
 ```
-bazel build runsc
-sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin
-```
-
-If you don't want to install bazel on your system, you can build runsc in a
-Docker container:
-
-```
 make runsc
 sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin
 ```
 
 ### Testing
 
-The test suite can be run with Bazel:
-
-```
-bazel test //...
-```
-
-or in a Docker container:
+To run standard test suites, you can use:
 
 ```
 make unit-tests
 make tests
 ```
 
-### Using remote execution
-
-If you have a [Remote Build Execution][rbe] environment, you can use it to speed
-up build and test cycles.
-
-You must authenticate with the project first:
+To run specific tests, you can specify the target:
 
 ```
-gcloud auth application-default login --no-launch-browser
+make test TARGETS="//runsc:version_test"
 ```
 
-Then invoke bazel with the following flags:
-
-```
---config=remote
---project_id=$PROJECT
---remote_instance_name=projects/$PROJECT/instances/default_instance
-```
-
-You can also add those flags to your local ~/.bazelrc to avoid needing to
-specify them each time on the command line.
-
 ### Using `go get`
 
 This project uses [bazel][bazel] to build and manage dependencies. A synthetic
@@ -128,7 +98,7 @@ development on this branch is not supported. Development should occur on the
 
 ## Community & Governance
 
-The governance model is documented in our [community][community] repository.
+See [GOVERNANCE.md](GOVERNANCE.md) for project governance information.
 
 The [gvisor-users mailing list][gvisor-users-list] and
 [gvisor-dev mailing list][gvisor-dev-list] are good starting points for
@@ -145,12 +115,9 @@ See [Contributing.md](CONTRIBUTING.md).
 [bazel]: https://bazel.build
 [community]: https://gvisor.googlesource.com/community
 [docker]: https://www.docker.com
-[git]: https://git-scm.com
 [gvisor-users-list]: https://groups.google.com/forum/#!forum/gvisor-users
+[gvisor-dev]: https://gvisor.dev
 [gvisor-dev-list]: https://groups.google.com/forum/#!forum/gvisor-dev
 [oci]: https://www.opencontainers.org
 [old-linux]: https://gvisor.dev/docs/user_guide/networking/#gso
-[python]: https://python.org
-[rbe]: https://blog.bazel.build/2018/10/05/remote-build-execution.html
 [sandbox]: https://en.wikipedia.org/wiki/Sandbox_(computer_security)
-[gvisor-dev]: https://gvisor.dev
diff --git a/benchmarks/tcp/tcp_benchmark.sh b/benchmarks/tcp/tcp_benchmark.sh
index e65801a7b..ef04b4ace 100755
--- a/benchmarks/tcp/tcp_benchmark.sh
+++ b/benchmarks/tcp/tcp_benchmark.sh
@@ -94,6 +94,9 @@ while [ $# -gt 0 ]; do
     --cubic)
       netstack_opts="${netstack_opts} -cubic"
       ;;
+    --moderate-recv-buf)
+      netstack_opts="${netstack_opts} -moderate_recv_buf"
+      ;;
     --duration)
       shift
       [ "$#" -le 0 ] && echo "no duration provided" && exit 1
@@ -147,8 +150,9 @@ while [ $# -gt 0 ]; do
       echo " --client              use netstack as the client"
       echo " --ideal               reset all network emulation"
       echo " --server              use netstack as the server"
-      echo " --mtu                 set the mtu (bytes)"
+      echo " --mtu                 set the mtu (bytes)"      
       echo " --sack                enable SACK support"
+      echo " --moderate-recv-buf   enable TCP receive buffer auto-tuning"
       echo " --cubic               enable CUBIC congestion control for Netstack"
       echo " --duration            set the test duration (s)"
       echo " --latency             set the latency (ms)"
diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
index dc1593b34..f5aa0b515 100644
--- a/benchmarks/tcp/tcp_proxy.go
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -56,6 +56,7 @@ var (
 	mask               = flag.Int("mask", 8, "mask size for address")
 	iface              = flag.String("iface", "", "network interface name to bind for netstack")
 	sack               = flag.Bool("sack", false, "enable SACK support for netstack")
+	moderateRecvBuf    = flag.Bool("moderate_recv_buf", false, "enable TCP Receive Buffer Auto-tuning")
 	cubic              = flag.Bool("cubic", false, "enable use of CUBIC congestion control for netstack")
 	gso                = flag.Int("gso", 0, "GSO maximum size")
 	swgso              = flag.Bool("swgso", false, "software-level GSO")
@@ -231,6 +232,11 @@ func newNetstackImpl(mode string) (impl, error) {
 		return nil, fmt.Errorf("SetTransportProtocolOption for SACKEnabled failed: %v", err)
 	}
 
+	// Enable Receive Buffer Auto-Tuning.
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(*moderateRecvBuf)); err != nil {
+		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+	}
+
 	// Set Congestion Control to cubic if requested.
 	if *cubic {
 		if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption("cubic")); err != nil {
diff --git a/benchmarks/workloads/absl/Dockerfile b/benchmarks/workloads/absl/Dockerfile
index e935c5ddc..f29cfa156 100644
--- a/benchmarks/workloads/absl/Dockerfile
+++ b/benchmarks/workloads/absl/Dockerfile
@@ -16,9 +16,10 @@ RUN wget https://github.com/bazelbuild/bazel/releases/download/0.27.0/bazel-0.27
 RUN chmod +x bazel-0.27.0-installer-linux-x86_64.sh
 RUN ./bazel-0.27.0-installer-linux-x86_64.sh
 
-RUN git clone https://github.com/abseil/abseil-cpp.git
+RUN mkdir abseil-cpp && cd abseil-cpp \
+    && git init && git remote add origin https://github.com/abseil/abseil-cpp.git \
+    && git fetch --depth 1 origin 43ef2148c0936ebf7cb4be6b19927a9d9d145b8f && git checkout FETCH_HEAD
 WORKDIR abseil-cpp
-RUN git checkout 43ef2148c0936ebf7cb4be6b19927a9d9d145b8f
 RUN bazel clean
 ENV path "absl/base/..."
 CMD bazel build ${path} 2>&1
diff --git a/benchmarks/workloads/ruby_template/Gemfile.lock b/benchmarks/workloads/ruby_template/Gemfile.lock
index f637b6081..eeb3c7bbe 100644
--- a/benchmarks/workloads/ruby_template/Gemfile.lock
+++ b/benchmarks/workloads/ruby_template/Gemfile.lock
@@ -2,7 +2,7 @@ GEM
   remote: https://rubygems.org/
   specs:
     mustermann (1.0.3)
-    puma (3.12.4)
+    puma (3.12.6)
     rack (2.0.6)
     rack-protection (2.0.5)
       rack
diff --git a/benchmarks/workloads/tensorflow/Dockerfile b/benchmarks/workloads/tensorflow/Dockerfile
index 262643b98..b5763e8ae 100644
--- a/benchmarks/workloads/tensorflow/Dockerfile
+++ b/benchmarks/workloads/tensorflow/Dockerfile
@@ -2,7 +2,7 @@ FROM tensorflow/tensorflow:1.13.2
 
 RUN apt-get update \
     && apt-get install -y git
-RUN git clone https://github.com/aymericdamien/TensorFlow-Examples.git
+RUN git clone --depth 1 https://github.com/aymericdamien/TensorFlow-Examples.git
 RUN python -m pip install -U pip setuptools
 RUN python -m pip install matplotlib
 
diff --git a/g3doc/BUILD b/g3doc/BUILD
index 24177ad06..dbbf96204 100644
--- a/g3doc/BUILD
+++ b/g3doc/BUILD
@@ -9,6 +9,10 @@ doc(
     name = "index",
     src = "README.md",
     category = "Project",
+    data = glob([
+        "*.png",
+        "*.svg",
+    ]),
     permalink = "/docs/",
     weight = "0",
 )
diff --git a/g3doc/architecture_guide/Layers.png b/g3doc/Layers.png
index 308c6c451..308c6c451 100644
--- a/g3doc/architecture_guide/Layers.png
+++ b/g3doc/Layers.png
diff --git a/g3doc/architecture_guide/Layers.svg b/g3doc/Layers.svg
index 0a366f841..0a366f841 100644
--- a/g3doc/architecture_guide/Layers.svg
+++ b/g3doc/Layers.svg
diff --git a/g3doc/architecture_guide/Machine-Virtualization.png b/g3doc/Machine-Virtualization.png
index 1ba2ed6b2..1ba2ed6b2 100644
--- a/g3doc/architecture_guide/Machine-Virtualization.png
+++ b/g3doc/Machine-Virtualization.png
diff --git a/g3doc/architecture_guide/Machine-Virtualization.svg b/g3doc/Machine-Virtualization.svg
index 5352da07b..5352da07b 100644
--- a/g3doc/architecture_guide/Machine-Virtualization.svg
+++ b/g3doc/Machine-Virtualization.svg
diff --git a/g3doc/README.md b/g3doc/README.md
index 7999f5d47..304a91493 100644
--- a/g3doc/README.md
+++ b/g3doc/README.md
@@ -1,6 +1,6 @@
 # What is gVisor?
 
-gVisor is a user-space kernel, written in Go, that implements a substantial
+gVisor is an application kernel, written in Go, that implements a substantial
 portion of the [Linux system call interface][linux]. It provides an additional
 layer of isolation between running applications and the host operating system.
 
@@ -9,19 +9,160 @@ that makes it easy to work with existing container tooling. The `runsc` runtime
 integrates with Docker and Kubernetes, making it simple to run sandboxed
 containers.
 
-gVisor takes a distinct approach to container sandboxing and makes a different
-set of technical trade-offs compared to existing sandbox technologies, thus
-providing new tools and ideas for the container security landscape.
-
 gVisor can be used with Docker, Kubernetes, or directly using `runsc`. Use the
 links below to see detailed instructions for each of them:
 
-*   [Docker](./user_guide/quick_start/docker/): The quickest and easiest way to
-    get started.
-*   [Kubernetes](./user_guide/quick_start/kubernetes/): Isolate Pods in your K8s
-    cluster with gVisor.
-*   [OCI Quick Start](./user_guide/quick_start/oci/): Expert mode. Customize
+*   [Docker](./user_guide/quick_start/docker.md): The quickest and easiest way
+    to get started.
+*   [Kubernetes](./user_guide/quick_start/kubernetes.md): Isolate Pods in your
+    K8s cluster with gVisor.
+*   [OCI Quick Start](./user_guide/quick_start/oci.md): Expert mode. Customize
     gVisor for your environment.
 
+## What does gVisor do?
+
+gVisor provides a virtualized environment in order to sandbox containers. The
+system interfaces normally implemented by the host kernel are moved into a
+distinct, per-sandbox application kernel in order to minimize the risk of an
+container escape exploit. gVisor does not introduce large fixed overheads
+however, and still retains a process-like model with respect to resource
+utilization.
+
+## How is this different?
+
+Two other approaches are commonly taken to provide stronger isolation than
+native containers.
+
+**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes
+virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This
+virtualized hardware is generally enlightened (paravirtualized) and additional
+mechanisms can be used to improve the visibility between the guest and host
+(e.g. balloon drivers, paravirtualized spinlocks). Running containers in
+distinct virtual machines can provide great isolation, compatibility and
+performance (though nested virtualization may bring challenges in this area),
+but for containers it often requires additional proxies and agents, and may
+require a larger resource footprint and slower start-up times.
+
+![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization")
+
+**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and
+[AppArmor][apparmor], allows the specification of a fine-grained security policy
+for an application or container. These schemes typically rely on hooks
+implemented inside the host kernel to enforce the rules. If the surface can be
+made small enough, then this is an excellent way to sandbox applications and
+maintain native performance. However, in practice it can be extremely difficult
+(if not impossible) to reliably define a policy for arbitrary, previously
+unknown applications, making this approach challenging to apply universally.
+
+![Rule-based execution](Rule-Based-Execution.png "Rule-based execution")
+
+Rule-based execution is often combined with additional layers for
+defense-in-depth.
+
+**gVisor** provides a third isolation mechanism, distinct from those above.
+
+gVisor intercepts application system calls and acts as the guest kernel, without
+the need for translation through virtualized hardware. gVisor may be thought of
+as either a merged guest kernel and VMM, or as seccomp on steroids. This
+architecture allows it to provide a flexible resource footprint (i.e. one based
+on threads and memory mappings, not fixed guest physical resources) while also
+lowering the fixed costs of virtualization. However, this comes at the price of
+reduced application compatibility and higher per-system call overhead.
+
+![gVisor](Layers.png "gVisor")
+
+On top of this, gVisor employs rule-based execution to provide defense-in-depth
+(details below).
+
+gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML
+virtualizes hardware internally and thus provides a fixed resource footprint.
+
+Each of the above approaches may excel in distinct scenarios. For example,
+machine-level virtualization will face challenges achieving high density, while
+gVisor may provide poor performance for system call heavy workloads.
+
+## Why Go?
+
+gVisor is written in [Go][golang] in order to avoid security pitfalls that can
+plague kernels. With Go, there are strong types, built-in bounds checks, no
+uninitialized variables, no use-after-free, no stack overflow, and a built-in
+race detector. However, the use of Go has its challenges, and the runtime often
+introduces performance overhead.
+
+## What are the different components?
+
+A gVisor sandbox consists of multiple processes. These processes collectively
+comprise an environment in which one or more containers can be run.
+
+Each sandbox has its own isolated instance of:
+
+*   The **Sentry**, which is a kernel that runs the containers and intercepts
+    and responds to system calls made by the application.
+
+Each container running in the sandbox has its own isolated instance of:
+
+*   A **Gofer** which provides file system access to the containers.
+
+![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram")
+
+## What is runsc?
+
+The entrypoint to running a sandboxed container is the `runsc` executable.
+`runsc` implements the [Open Container Initiative (OCI)][oci] runtime
+specification, which is used by Docker and Kubernetes. This means that OCI
+compatible _filesystem bundles_ can be run by `runsc`. Filesystem bundles are
+comprised of a `config.json` file containing container configuration, and a root
+filesystem for the container. Please see the [OCI runtime spec][runtime-spec]
+for more information on filesystem bundles. `runsc` implements multiple commands
+that perform various functions such as starting, stopping, listing, and querying
+the status of containers.
+
+### Sentry
+
+<a name="sentry"></a> <!-- For deep linking. -->
+
+The Sentry is the largest component of gVisor. It can be thought of as a
+application kernel. The Sentry implements all the kernel functionality needed by
+the application, including: system calls, signal delivery, memory management and
+page faulting logic, the threading model, and more.
+
+When the application makes a system call, the
+[Platform](./architecture_guide/platforms.md) redirects the call to the Sentry,
+which will do the necessary work to service it. It is important to note that the
+Sentry does not pass system calls through to the host kernel. As a userspace
+application, the Sentry will make some host system calls to support its
+operation, but it does not allow the application to directly control the system
+calls it makes. For example, the Sentry is not able to open files directly; file
+system operations that extend beyond the sandbox (not internal `/proc` files,
+pipes, etc) are sent to the Gofer, described below.
+
+### Gofer
+
+<a name="gofer"></a> <!-- For deep linking. -->
+
+The Gofer is a standard host process which is started with each container and
+communicates with the Sentry via the [9P protocol][9p] over a socket or shared
+memory channel. The Sentry process is started in a restricted seccomp container
+without access to file system resources. The Gofer mediates all access to the
+these resources, providing an additional level of isolation.
+
+### Application
+
+The application is a normal Linux binary provided to gVisor in an OCI runtime
+bundle. gVisor aims to provide an environment equivalent to Linux v4.4, so
+applications should be able to run unmodified. However, gVisor does not
+presently implement every system call, `/proc` file, or `/sys` file so some
+incompatibilities may occur. See [Commpatibility](./user_guide/compatibility.md)
+for more information.
+
+[9p]: https://en.wikipedia.org/wiki/9P_(protocol)
+[apparmor]: https://wiki.ubuntu.com/AppArmor
+[golang]: https://golang.org
+[kvm]: https://www.linux-kvm.org
 [linux]: https://en.wikipedia.org/wiki/Linux_kernel_interfaces
 [oci]: https://www.opencontainers.org
+[runtime-spec]: https://github.com/opencontainers/runtime-spec
+[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt
+[selinux]: https://selinuxproject.org
+[uml]: http://user-mode-linux.sourceforge.net/
+[xen]: https://www.xenproject.org
diff --git a/g3doc/architecture_guide/Rule-Based-Execution.png b/g3doc/Rule-Based-Execution.png
index b42654a90..b42654a90 100644
--- a/g3doc/architecture_guide/Rule-Based-Execution.png
+++ b/g3doc/Rule-Based-Execution.png
diff --git a/g3doc/architecture_guide/Rule-Based-Execution.svg b/g3doc/Rule-Based-Execution.svg
index bd6717043..bd6717043 100644
--- a/g3doc/architecture_guide/Rule-Based-Execution.svg
+++ b/g3doc/Rule-Based-Execution.svg
diff --git a/g3doc/architecture_guide/Sentry-Gofer.png b/g3doc/Sentry-Gofer.png
index ca2c27ef7..ca2c27ef7 100644
--- a/g3doc/architecture_guide/Sentry-Gofer.png
+++ b/g3doc/Sentry-Gofer.png
diff --git a/g3doc/architecture_guide/Sentry-Gofer.svg b/g3doc/Sentry-Gofer.svg
index 5c10750d2..5c10750d2 100644
--- a/g3doc/architecture_guide/Sentry-Gofer.svg
+++ b/g3doc/Sentry-Gofer.svg
diff --git a/g3doc/architecture_guide/BUILD b/g3doc/architecture_guide/BUILD
index 72038305b..404f627a4 100644
--- a/g3doc/architecture_guide/BUILD
+++ b/g3doc/architecture_guide/BUILD
@@ -6,30 +6,12 @@ package(
 )
 
 doc(
-    name = "index",
-    src = "README.md",
-    category = "Architecture Guide",
-    data = [
-        "Layers.png",
-        "Layers.svg",
-        "Machine-Virtualization.png",
-        "Machine-Virtualization.svg",
-        "Rule-Based-Execution.png",
-        "Rule-Based-Execution.svg",
-        "Sentry-Gofer.png",
-        "Sentry-Gofer.svg",
-    ],
-    permalink = "/docs/architecture_guide/",
-    weight = "0",
-)
-
-doc(
     name = "platforms",
     src = "platforms.md",
     category = "Architecture Guide",
     data = [
-        "Sentry-Gofer.png",
-        "Sentry-Gofer.svg",
+        "platforms.png",
+        "platforms.svg",
     ],
     permalink = "/docs/architecture_guide/platforms/",
     weight = "40",
@@ -39,6 +21,10 @@ doc(
     name = "resources",
     src = "resources.md",
     category = "Architecture Guide",
+    data = [
+        "resources.png",
+        "resources.svg",
+    ],
     permalink = "/docs/architecture_guide/resources/",
     weight = "30",
 )
@@ -48,8 +34,8 @@ doc(
     src = "security.md",
     category = "Architecture Guide",
     data = [
-        "Layers.png",
-        "Layers.svg",
+        "security.png",
+        "security.svg",
     ],
     permalink = "/docs/architecture_guide/security/",
     weight = "10",
diff --git a/g3doc/architecture_guide/README.md b/g3doc/architecture_guide/README.md
deleted file mode 100644
index 1364a5358..000000000
--- a/g3doc/architecture_guide/README.md
+++ /dev/null
@@ -1,80 +0,0 @@
-# Overview
-
-gVisor provides a virtualized environment in order to sandbox untrusted
-containers. The system interfaces normally implemented by the host kernel are
-moved into a distinct, per-sandbox user space kernel in order to minimize the
-risk of an exploit. gVisor does not introduce large fixed overheads however, and
-still retains a process-like model with respect to resource utilization.
-
-## How is this different?
-
-Two other approaches are commonly taken to provide stronger isolation than
-native containers.
-
-**Machine-level virtualization**, such as [KVM][kvm] and [Xen][xen], exposes
-virtualized hardware to a guest kernel via a Virtual Machine Monitor (VMM). This
-virtualized hardware is generally enlightened (paravirtualized) and additional
-mechanisms can be used to improve the visibility between the guest and host
-(e.g. balloon drivers, paravirtualized spinlocks). Running containers in
-distinct virtual machines can provide great isolation, compatibility and
-performance (though nested virtualization may bring challenges in this area),
-but for containers it often requires additional proxies and agents, and may
-require a larger resource footprint and slower start-up times.
-
-![Machine-level virtualization](Machine-Virtualization.png "Machine-level virtualization")
-
-**Rule-based execution**, such as [seccomp][seccomp], [SELinux][selinux] and
-[AppArmor][apparmor], allows the specification of a fine-grained security policy
-for an application or container. These schemes typically rely on hooks
-implemented inside the host kernel to enforce the rules. If the surface can be
-made small enough (i.e. a sufficiently complete policy defined), then this is an
-excellent way to sandbox applications and maintain native performance. However,
-in practice it can be extremely difficult (if not impossible) to reliably define
-a policy for arbitrary, previously unknown applications, making this approach
-challenging to apply universally.
-
-![Rule-based execution](Rule-Based-Execution.png "Rule-based execution")
-
-Rule-based execution is often combined with additional layers for
-defense-in-depth.
-
-**gVisor** provides a third isolation mechanism, distinct from those above.
-
-gVisor intercepts application system calls and acts as the guest kernel, without
-the need for translation through virtualized hardware. gVisor may be thought of
-as either a merged guest kernel and VMM, or as seccomp on steroids. This
-architecture allows it to provide a flexible resource footprint (i.e. one based
-on threads and memory mappings, not fixed guest physical resources) while also
-lowering the fixed costs of virtualization. However, this comes at the price of
-reduced application compatibility and higher per-system call overhead.
-
-![gVisor](Layers.png "gVisor")
-
-On top of this, gVisor employs rule-based execution to provide defense-in-depth
-(details below).
-
-gVisor's approach is similar to [User Mode Linux (UML)][uml], although UML
-virtualizes hardware internally and thus provides a fixed resource footprint.
-
-Each of the above approaches may excel in distinct scenarios. For example,
-machine-level virtualization will face challenges achieving high density, while
-gVisor may provide poor performance for system call heavy workloads.
-
-### Why Go?
-
-gVisor is written in [Go][golang] in order to avoid security pitfalls that can
-plague kernels. With Go, there are strong types, built-in bounds checks, no
-uninitialized variables, no use-after-free, no stack overflow, and a built-in
-race detector. (The use of Go has its challenges too, and isn't free.)
-
-### What about Gofers?
-
-<a name="gofer"></a> <!-- For deep linking. -->
-
-[apparmor]: https://wiki.ubuntu.com/AppArmor
-[golang]: https://golang.org
-[kvm]: https://www.linux-kvm.org
-[seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt
-[selinux]: https://selinuxproject.org
-[uml]: http://user-mode-linux.sourceforge.net/
-[xen]: https://www.xenproject.org
diff --git a/g3doc/architecture_guide/performance.md b/g3doc/architecture_guide/performance.md
index 3862d78ee..39dbb0045 100644
--- a/g3doc/architecture_guide/performance.md
+++ b/g3doc/architecture_guide/performance.md
@@ -13,12 +13,13 @@ forms: additional cycles and memory usage, which may manifest as increased
 latency, reduced throughput or density, or not at all. In general, these costs
 come from two different sources.
 
-First, the existence of the [Sentry](../) means that additional memory will be
-required, and application system calls must traverse additional layers of
-software. The design emphasizes [security](../security/) and therefore we chose
-to use a language for the Sentry that provides benefits in this domain but may
-not yet offer the raw performance of other choices. Costs imposed by these
-design choices are **structural costs**.
+First, the existence of the [Sentry](../README.md#sentry) means that additional
+memory will be required, and application system calls must traverse additional
+layers of software. The design emphasizes
+[security](/docs/architecture_guide/security/) and therefore we chose to use a
+language for the Sentry that provides benefits in this domain but may not yet
+offer the raw performance of other choices. Costs imposed by these design
+choices are **structural costs**.
 
 Second, as gVisor is an independent implementation of the system call surface,
 many of the subsystems or specific calls are not as optimized as more mature
@@ -50,7 +51,7 @@ Virtual Machines (VMs) with the following specifications:
 
 Through this document, `runsc` is used to indicate the runtime provided by
 gVisor. When relevant, we use the name `runsc-platform` to describe a specific
-[platform choice](../platforms/).
+[platform choice](/docs/architecture_guide/platforms/).
 
 **Except where specified, all tests below are conducted with the `ptrace`
 platform. The `ptrace` platform works everywhere and does not require hardware
@@ -131,11 +132,11 @@ full start-up and run time for the workload, which trains a model.
 ## System calls
 
 Some **structural costs** of gVisor are heavily influenced by the
-[platform choice](../platforms/), which implements system call interception.
-Today, gVisor supports a variety of platforms. These platforms present distinct
-performance, compatibility and security trade-offs. For example, the KVM
-platform has low overhead system call interception but runs poorly with nested
-virtualization.
+[platform choice](/docs/architecture_guide/platforms/), which implements system
+call interception. Today, gVisor supports a variety of platforms. These
+platforms present distinct performance, compatibility and security trade-offs.
+For example, the KVM platform has low overhead system call interception but runs
+poorly with nested virtualization.
 
 {% include graph.html id="syscall" url="/performance/syscall.csv" title="perf.py
 syscall --runtime=runc --runtime=runsc-ptrace --runtime=runsc-kvm" y_min="100"
@@ -163,7 +164,8 @@ overhead.
 
 Some of these costs above are **structural costs**, and `redis` is likely to
 remain a challenging performance scenario. However, optimizing the
-[platform](../platforms/) will also have a dramatic impact.
+[platform](/docs/architecture_guide/platforms/) will also have a dramatic
+impact.
 
 ## Start-up time
 
@@ -184,7 +186,7 @@ similarly loads a number of modules and binds an HTTP server.
 > Note: most of the time overhead above is associated Docker itself. This is
 > evident with the empty `runc` benchmark. To avoid these costs with `runsc`,
 > you may also consider using `runsc do` mode or invoking the
-> [OCI runtime](../../user_guide/quick_start/oci/) directly.
+> [OCI runtime](../user_guide/quick_start/oci.md) directly.
 
 ## Network
 
@@ -222,8 +224,9 @@ In terms of raw disk I/O, gVisor does not introduce significant fundamental
 overhead. For general file operations, gVisor introduces a small fixed overhead
 for data that transitions across the sandbox boundary. This manifests as
 **structural costs** in some cases, since these operations must be routed
-through the [Gofer](../) as a result of our [security model](../security/), but
-in most cases are dominated by **implementation costs**, due to an internal
+through the [Gofer](../README.md#gofer) as a result of our
+[Security Model](/docs/architecture_guide/security/), but in most cases are
+dominated by **implementation costs**, due to an internal
 [Virtual File System][vfs] (VFS) implementation that needs improvement.
 
 {% include graph.html id="fio-bw" url="/performance/fio.csv" title="perf.py fio
diff --git a/g3doc/architecture_guide/platforms.md b/g3doc/architecture_guide/platforms.md
index 6e63da8ce..d112c9a28 100644
--- a/g3doc/architecture_guide/platforms.md
+++ b/g3doc/architecture_guide/platforms.md
@@ -1,86 +1,61 @@
 # Platform Guide
 
-A gVisor sandbox consists of multiple processes when running. These processes
-collectively comprise a shared environment in which one or more containers can
-be run.
+[TOC]
 
-Each sandbox has its own isolated instance of:
-
-*   The **Sentry**, A user-space kernel that runs the container and intercepts
-    and responds to system calls made by the application.
-
-Each container running in the sandbox has its own isolated instance of:
-
-*   A **Gofer** which provides file system access to the container.
-
-![gVisor architecture diagram](Sentry-Gofer.png "gVisor architecture diagram")
-
-## runsc
-
-The entrypoint to running a sandboxed container is the `runsc` executable.
-`runsc` implements the [Open Container Initiative (OCI)][oci] runtime
-specification. This means that OCI compatible _filesystem bundles_ can be run by
-`runsc`. Filesystem bundles are comprised of a `config.json` file containing
-container configuration, and a root filesystem for the container. Please see the
-[OCI runtime spec][runtime-spec] for more information on filesystem bundles.
-`runsc` implements multiple commands that perform various functions such as
-starting, stopping, listing, and querying the status of containers.
+gVisor requires a platform to implement interception of syscalls, basic context
+switching, and memory mapping functionality. Internally, gVisor uses an
+abstraction sensibly called [Platform][platform]. A simplified version of this
+interface looks like:
 
-## Sentry
+```golang
+type Platform interface {
+    NewAddressSpace() (AddressSpace, error)
+    NewContext() Context
+}
 
-The Sentry is the largest component of gVisor. It can be thought of as a
-userspace OS kernel. The Sentry implements all the kernel functionality needed
-by the untrusted application. It implements all of the supported system calls,
-signal delivery, memory management and page faulting logic, the threading model,
-and more.
+type Context interface {
+    Switch(as AddressSpace, ac arch.Context) (..., error)
+}
 
-When the untrusted application makes a system call, the currently used platform
-redirects the call to the Sentry, which will do the necessary work to service
-it. It is important to note that the Sentry will not simply pass through system
-calls to the host kernel. As a userspace application, the Sentry will make some
-host system calls to support its operation, but it will not allow the
-application to directly control the system calls it makes.
+type AddressSpace interface {
+    MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, ...) error
+    Unmap(addr usermem.Addr, length uint64)
+}
+```
 
-The Sentry aims to present an equivalent environment to (upstream) Linux v4.4.
+There are a number of different ways to implement this interface that come with
+various trade-offs, generally around performance and hardware requirements.
 
-File system operations that extend beyond the sandbox (not internal /proc files,
-pipes, etc) are sent to the Gofer, described below.
+## Implementations
 
-## Platforms
+The choice of platform depends on the context in which `runsc` is executing. In
+general, virtualized platforms may be limited to platforms that do not require
+hardware virtualized support (since the hardware is already in use):
 
-gVisor requires a platform to implement interception of syscalls, basic context
-switching, and memory mapping functionality.
+![Platforms](platforms.png "Platform examples.")
 
 ### ptrace
 
-The ptrace platform uses `PTRACE_SYSEMU` to execute user code without allowing
-it to execute host system calls. This platform can run anywhere that ptrace
-works (even VMs without nested virtualization).
-
-### KVM (experimental)
+The ptrace platform uses [PTRACE_SYSEMU][ptrace] to execute user code without
+allowing it to execute host system calls. This platform can run anywhere that
+`ptrace` works (even VMs without nested virtualization), which is ubiquitous.
 
-The KVM platform allows the Sentry to act as both guest OS and VMM, switching
-back and forth between the two worlds seamlessly. The KVM platform can run on
-bare-metal or in a VM with nested virtualization enabled. While there is no
-virtualized hardware layer -- the sandbox retains a process model -- gVisor
-leverages virtualization extensions available on modern processors in order to
-improve isolation and performance of address space switches.
+Unfortunately, the ptrace platform has high context switch overhead, so system
+call-heavy applications may pay a [performance penalty](./performance.md).
 
-## Gofer
+### KVM
 
-The Gofer is a normal host Linux process. The Gofer is started with each sandbox
-and connected to the Sentry. The Sentry process is started in a restricted
-seccomp container without access to file system resources. The Gofer provides
-the Sentry access to file system resources via the 9P protocol and provides an
-additional level of isolation.
+The KVM platform uses the kernel's [KVM][kvm] functionality to allow the Sentry
+to act as both guest OS and VMM. The KVM platform can run on bare-metal or in a
+VM with nested virtualization enabled. While there is no virtualized hardware
+layer -- the sandbox retains a process model -- gVisor leverages virtualization
+extensions available on modern processors in order to improve isolation and
+performance of address space switches.
 
-## Application
+## Changing Platforms
 
-The application (aka the untrusted application) is a normal Linux binary
-provided to gVisor in an OCI runtime bundle. gVisor aims to provide an
-environment equivalent to Linux v4.4, so applications should be able to run
-unmodified. However, gVisor does not presently implement every system call,
-/proc file, or /sys file so some incompatibilities may occur.
+See [Changing Platforms](../user_guide/platforms.md).
 
-[oci]: https://www.opencontainers.org
-[runtime-spec]: https://github.com/opencontainers/runtime-spec
+[kvm]: https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt
+[platform]: https://cs.opensource.google/gvisor/gvisor/+/release-20190304.1:pkg/sentry/platform/platform.go;l=33
+[ptrace]: http://man7.org/linux/man-pages/man2/ptrace.2.html
diff --git a/g3doc/architecture_guide/platforms.png b/g3doc/architecture_guide/platforms.png
new file mode 100644
index 000000000..005d56feb
--- /dev/null
+++ b/g3doc/architecture_guide/platforms.png
diff --git a/g3doc/architecture_guide/platforms.svg b/g3doc/architecture_guide/platforms.svg
new file mode 100644
index 000000000..b0bac9ba7
--- /dev/null
+++ b/g3doc/architecture_guide/platforms.svg
@@ -0,0 +1,334 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="142.67763mm"
+   height="67.063133mm"
+   viewBox="0 0 142.67763 67.063134"
+   version="1.1"
+   id="svg8"
+   inkscape:export-filename="/home/ascannell/resources.png"
+   inkscape:export-xdpi="53.50127"
+   inkscape:export-ydpi="53.50127"
+   inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
+   sodipodi:docname="platforms.svg">
+  <defs
+     id="defs2" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.98994949"
+     inkscape:cx="86.443612"
+     inkscape:cy="102.88104"
+     inkscape:document-units="mm"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:window-width="1920"
+     inkscape:window-height="1005"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1" />
+  <metadata
+     id="metadata5">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-36.081387,-98.953278)">
+    <rect
+       id="rect10"
+       width="33.408691"
+       height="33.408691"
+       x="36.081387"
+       y="120.06757"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <rect
+       style="fill:#b3b3b3;stroke-width:0.23881446"
+       id="rect16"
+       width="142.45465"
+       height="10.423517"
+       x="36.08139"
+       y="155.5929" />
+    <rect
+       id="rect10-7"
+       width="30.52453"
+       height="18.976137"
+       x="37.416695"
+       y="121.65508"
+       style="fill:#ff8080;stroke-width:0.19060372" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="41.03727"
+       y="148.58765"
+       id="text65"><tspan
+         sodipodi:role="line"
+         id="tspan63"
+         x="41.03727"
+         y="148.58765"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="45.473087"
+       y="132.50232"
+       id="text123"><tspan
+         sodipodi:role="line"
+         id="tspan121"
+         x="45.473087"
+         y="132.50232"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:6.43922186px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.16098055"
+       x="97.768547"
+       y="163.15665"
+       id="text163"><tspan
+         sodipodi:role="line"
+         id="tspan161"
+         x="97.768547"
+         y="163.15665"
+         style="stroke-width:0.16098055">host</tspan></text>
+    <rect
+       style="fill:#e9afdd;stroke-width:0.39185274"
+       id="rect16-7"
+       width="72.9646"
+       height="54.79026"
+       x="105.79441"
+       y="98.953278" />
+    <rect
+       id="rect10-5"
+       width="33.408691"
+       height="33.408691"
+       x="108.24348"
+       y="100.53072"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <rect
+       id="rect10-7-6"
+       width="30.52453"
+       height="20.045216"
+       x="109.57877"
+       y="102.11823"
+       style="fill:#ff8080;stroke-width:0.19589928" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="112.86765"
+       y="129.01863"
+       id="text65-2"><tspan
+         sodipodi:role="line"
+         id="tspan63-9"
+         x="112.86765"
+         y="129.01863"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="117.63519"
+       y="114.02371"
+       id="text123-1"><tspan
+         sodipodi:role="line"
+         id="tspan121-2"
+         x="117.63519"
+         y="114.02371"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <rect
+       id="rect10-7-7"
+       width="11.815663"
+       height="8.0126781"
+       x="54.538059"
+       y="143.27702"
+       style="fill:#aaccff;stroke-width:0.07705856" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:4.35074377px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.10876859"
+       x="55.931114"
+       y="148.90578"
+       id="text144"><tspan
+         sodipodi:role="line"
+         id="tspan142"
+         x="55.931114"
+         y="148.90578"
+         style="stroke-width:0.10876859">KVM</tspan></text>
+    <rect
+       id="rect10-6"
+       width="33.408691"
+       height="33.408691"
+       x="71.044685"
+       y="119.73112"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <rect
+       id="rect10-7-0"
+       width="30.52453"
+       height="18.976137"
+       x="72.37999"
+       y="121.31865"
+       style="fill:#ff8080;stroke-width:0.19060372" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="76.000565"
+       y="148.25128"
+       id="text65-6"><tspan
+         sodipodi:role="line"
+         id="tspan63-2"
+         x="76.000565"
+         y="148.25128"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="80.436386"
+       y="132.16595"
+       id="text123-6"><tspan
+         sodipodi:role="line"
+         id="tspan121-1"
+         x="80.436386"
+         y="132.16595"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <rect
+       id="rect10-7-7-8"
+       width="11.815664"
+       height="8.0126781"
+       x="89.501358"
+       y="142.94067"
+       style="fill:#ffeeaa;stroke-width:0.07705856" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416"
+       x="89.92292"
+       y="147.89806"
+       id="text144-7"><tspan
+         sodipodi:role="line"
+         id="tspan142-9"
+         x="89.92292"
+         y="147.89806"
+         style="stroke-width:0.08486416">ptrace</tspan></text>
+    <rect
+       id="rect10-7-7-8-3"
+       width="11.815665"
+       height="8.0126781"
+       x="127.08897"
+       y="123.97878"
+       style="fill:#ffeeaa;stroke-width:0.07705856" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416"
+       x="127.51052"
+       y="128.9362"
+       id="text144-7-7"><tspan
+         sodipodi:role="line"
+         id="tspan142-9-5"
+         x="127.51052"
+         y="128.9362"
+         style="stroke-width:0.08486416">ptrace</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:5.45061255px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.13626531"
+       x="138.49318"
+       y="152.11841"
+       id="text229"><tspan
+         sodipodi:role="line"
+         id="tspan227"
+         x="138.49318"
+         y="152.11841"
+         style="stroke-width:0.13626531">VM</tspan></text>
+    <rect
+       style="fill:#b3b3b3;stroke-width:0.16518368"
+       id="rect16-9"
+       width="68.15374"
+       height="10.423517"
+       x="108.24348"
+       y="134.99774" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:6.17854786px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.15446369"
+       x="132.91473"
+       y="142.07658"
+       id="text248"><tspan
+         sodipodi:role="line"
+         id="tspan246"
+         x="132.91473"
+         y="142.07658"
+         style="stroke-width:0.15446369">guest</tspan></text>
+    <rect
+       id="rect10-5-2"
+       width="33.408691"
+       height="33.408691"
+       x="143.32402"
+       y="100.35877"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <rect
+       id="rect10-7-6-2"
+       width="30.52453"
+       height="20.045216"
+       x="144.65933"
+       y="101.94627"
+       style="fill:#ff8080;stroke-width:0.19589929" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="147.94815"
+       y="128.84665"
+       id="text65-2-8"><tspan
+         sodipodi:role="line"
+         id="tspan63-9-9"
+         x="147.94815"
+         y="128.84665"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="152.71565"
+       y="113.85176"
+       id="text123-1-7"><tspan
+         sodipodi:role="line"
+         id="tspan121-2-3"
+         x="152.71565"
+         y="113.85176"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <rect
+       id="rect10-7-7-8-3-6"
+       width="11.815666"
+       height="8.0126781"
+       x="162.16933"
+       y="123.80682"
+       style="fill:#ffeeaa;stroke-width:0.07705856" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.39456654px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08486416"
+       x="162.59088"
+       y="128.76421"
+       id="text144-7-7-1"><tspan
+         sodipodi:role="line"
+         id="tspan142-9-5-2"
+         x="162.59088"
+         y="128.76421"
+         style="stroke-width:0.08486416">ptrace</tspan></text>
+  </g>
+</svg>
diff --git a/g3doc/architecture_guide/resources.md b/g3doc/architecture_guide/resources.md
index 894f995ae..1dec37bd1 100644
--- a/g3doc/architecture_guide/resources.md
+++ b/g3doc/architecture_guide/resources.md
@@ -10,9 +10,10 @@ sandbox to be highly dynamic in terms of resource usage: spanning a large number
 of cores and large amount of memory when busy, and yielding those resources back
 to the host when not.
 
-Some of the details here may depend on the [platform](../platforms/), but in
-general this page describes the resource model used by gVisor. If you're not
-familiar with the terms here, uou may want to start with the [Overview](../).
+In order words, the shape of the sandbox should closely track the shape of the
+sandboxed process:
+
+![Resource model](resources.png "Workloads of different shapes.")
 
 ## Processes
 
@@ -23,9 +24,9 @@ the sandbox (e.g. via a [Docker exec][exec]).
 
 ## Networking
 
-Similarly to processes, the sandbox attaches a network endpoint to the system,
-but runs it's own network stack. All network resources, other than packets in
-flight, exist only inside the sandbox, bound by relevant resource limits.
+The sandbox attaches a network endpoint to the system, but runs it's own network
+stack. All network resources, other than packets in flight on the host, exist
+only inside the sandbox, bound by relevant resource limits.
 
 You can interact with network endpoints exposed by the sandbox, just as you
 would any other container, but network introspection similarly requires entering
@@ -33,15 +34,14 @@ the sandbox.
 
 ## Files
 
-Files may be backed by different implementations. For host-native files (where a
-file descriptor is available), the Gofer may return a file descriptor to the
-Sentry via [SCM_RIGHTS][scmrights][^1].
+Files in the sandbox may be backed by different implementations. For host-native
+files (where a file descriptor is available), the Gofer may return a file
+descriptor to the Sentry via [SCM_RIGHTS][scmrights][^1].
 
 These files may be read from and written to through standard system calls, and
 also mapped into the associated application's address space. This allows the
 same host memory to be shared across multiple sandboxes, although this mechanism
-does not preclude the use of side-channels (see the
-[security model](../security/)).
+does not preclude the use of side-channels (see [Security Model](./security.md).
 
 Note that some file systems exist only within the context of the sandbox. For
 example, in many cases a `tmpfs` mount will be available at `/tmp` or
@@ -64,8 +64,9 @@ scheduling decisions about all application threads.
 ## Time
 
 Time in the sandbox is provided by the Sentry, through its own [vDSO][vdso] and
-timekeeping implementation. This is divorced from the host time, and no state is
-shared with the host, although the time will be initialized with the host clock.
+time-keeping implementation. This is distinct from the host time, and no state
+is shared with the host, although the time will be initialized with the host
+clock.
 
 The Sentry runs timers to note the passage of time, much like a kernel running
 on hardware (though the timers are software timers, in this case). These timers
diff --git a/g3doc/architecture_guide/resources.png b/g3doc/architecture_guide/resources.png
new file mode 100644
index 000000000..f715008ec
--- /dev/null
+++ b/g3doc/architecture_guide/resources.png
diff --git a/g3doc/architecture_guide/resources.svg b/g3doc/architecture_guide/resources.svg
new file mode 100644
index 000000000..fd7805d90
--- /dev/null
+++ b/g3doc/architecture_guide/resources.svg
@@ -0,0 +1,208 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="108.24417mm"
+   height="47.513165mm"
+   viewBox="0 0 108.24417 47.513165"
+   version="1.1"
+   id="svg8"
+   inkscape:export-filename="/home/ascannell/resources.png"
+   inkscape:export-xdpi="53.50127"
+   inkscape:export-ydpi="53.50127"
+   inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
+   sodipodi:docname="resources.svg">
+  <defs
+     id="defs2" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.98994949"
+     inkscape:cx="16.897058"
+     inkscape:cy="41.261746"
+     inkscape:document-units="mm"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:window-width="1920"
+     inkscape:window-height="1005"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1" />
+  <metadata
+     id="metadata5">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-36.081387,-118.50325)">
+    <rect
+       id="rect10"
+       width="33.408691"
+       height="33.408691"
+       x="36.081387"
+       y="120.06757"
+       style="fill:#44aa00;stroke-width:0.26458332" />
+    <circle
+       style="fill:#44aa00;stroke-width:0.21849461"
+       id="path12"
+       cx="87.958534"
+       cy="136.63828"
+       r="17.105247" />
+    <path
+       sodipodi:type="star"
+       style="fill:#44aa00;stroke-width:0.26458332"
+       id="path14"
+       sodipodi:sides="3"
+       sodipodi:cx="124.13387"
+       sodipodi:cy="141.81859"
+       sodipodi:r1="23.31534"
+       sodipodi:r2="11.65767"
+       sodipodi:arg1="0.52359878"
+       sodipodi:arg2="1.5707963"
+       inkscape:flatsided="false"
+       inkscape:rounded="0"
+       inkscape:randomized="0"
+       d="m 144.32555,153.47626 -20.19168,0 -20.19167,0 10.09583,-17.48651 10.09584,-17.4865 10.09584,17.4865 z"
+       inkscape:transform-center-x="1.8384776e-06"
+       inkscape:transform-center-y="-5.8288369" />
+    <rect
+       style="fill:#b3b3b3;stroke-width:0.20817307"
+       id="rect16"
+       width="108.24416"
+       height="10.423517"
+       x="36.08139"
+       y="155.5929" />
+    <path
+       sodipodi:type="star"
+       style="fill:#ff8080;stroke-width:0.20018946"
+       id="path14-3"
+       sodipodi:sides="3"
+       sodipodi:cx="124.13387"
+       sodipodi:cy="139.31911"
+       sodipodi:r1="17.640888"
+       sodipodi:r2="8.8204451"
+       sodipodi:arg1="0.52359878"
+       sodipodi:arg2="1.5707963"
+       inkscape:flatsided="false"
+       inkscape:rounded="0"
+       inkscape:randomized="0"
+       d="m 139.41133,148.13955 -15.27746,0 -15.27745,0 7.63872,-13.23067 7.63873,-13.23066 7.63873,13.23066 z"
+       inkscape:transform-center-x="3.9117172e-06"
+       inkscape:transform-center-y="-4.4102243" />
+    <circle
+       style="fill:#ff8080;stroke-width:0.18094084"
+       id="path12-6"
+       cx="87.93705"
+       cy="134.75125"
+       r="14.165282" />
+    <rect
+       id="rect10-7"
+       width="30.52453"
+       height="25.657875"
+       x="37.416695"
+       y="121.65508"
+       style="fill:#ff8080;stroke-width:0.22163473" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="47.387276"
+       y="151.7626"
+       id="text65"><tspan
+         sodipodi:role="line"
+         id="tspan63"
+         x="47.387276"
+         y="151.7626"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="82.156319"
+       y="151.71547"
+       id="text65-5"><tspan
+         sodipodi:role="line"
+         id="tspan63-3"
+         x="82.156319"
+         y="151.71547"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.40292525px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08507314"
+       x="118.66879"
+       y="151.71547"
+       id="text65-5-5"><tspan
+         sodipodi:role="line"
+         id="tspan63-3-6"
+         x="118.66879"
+         y="151.71547"
+         style="stroke-width:0.08507314">gVisor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="45.473087"
+       y="136.20644"
+       id="text123"><tspan
+         sodipodi:role="line"
+         id="tspan121"
+         x="45.473087"
+         y="136.20644"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="80.153076"
+       y="136.00925"
+       id="text123-1"><tspan
+         sodipodi:role="line"
+         id="tspan121-2"
+         x="80.153076"
+         y="136.00925"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.33113885px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.08327847"
+       x="116.50173"
+       y="138.68195"
+       id="text123-1-7"><tspan
+         sodipodi:role="line"
+         id="tspan121-2-0"
+         x="116.50173"
+         y="138.68195"
+         style="stroke-width:0.08327847">workload</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:6.43922186px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.16098055"
+       x="81.893562"
+       y="163.15665"
+       id="text163"><tspan
+         sodipodi:role="line"
+         id="tspan161"
+         x="81.893562"
+         y="163.15665"
+         style="stroke-width:0.16098055">host</tspan></text>
+  </g>
+</svg>
diff --git a/g3doc/architecture_guide/security.md b/g3doc/architecture_guide/security.md
index f78586291..b99b86332 100644
--- a/g3doc/architecture_guide/security.md
+++ b/g3doc/architecture_guide/security.md
@@ -86,15 +86,17 @@ a substitute for a secure architecture*.
 
 ## Goals: Limiting Exposure
 
-gVisor’s primary design goal is to minimize the System API attack vector while
-still providing a process model. There are two primary security principles that
-inform this design. First, the application’s direct interactions with the host
-System API are intercepted by the Sentry, which implements the System API
-instead. Second, the System API accessible to the Sentry itself is minimized to
-a safer, restricted set. The first principle minimizes the possibility of direct
-exploitation of the host System API by applications, and the second principle
-minimizes indirect exploitability, which is the exploitation by an exploited or
-buggy Sentry (e.g. chaining an exploit).
+![Threat model](security.png "Threat model.")
+
+gVisor’s primary design goal is to minimize the System API attack vector through
+multiple layers of defense, while still providing a process model. There are two
+primary security principles that inform this design. First, the application’s
+direct interactions with the host System API are intercepted by the Sentry,
+which implements the System API instead. Second, the System API accessible to
+the Sentry itself is minimized to a safer, restricted set. The first principle
+minimizes the possibility of direct exploitation of the host System API by
+applications, and the second principle minimizes indirect exploitability, which
+is the exploitation by an exploited or buggy Sentry (e.g. chaining an exploit).
 
 The first principle is similar to the security basis for a Virtual Machine (VM).
 With a VM, an application’s interactions with the host are replaced by
@@ -210,9 +212,9 @@ crashes are recorded and triaged to similarly identify material issues.
 ### Is this more or less secure than a Virtual Machine?
 
 The security of a VM depends to a large extent on what is exposed from the host
-kernel and user space support code. For example, device emulation code in the
+kernel and userspace support code. For example, device emulation code in the
 host kernel (e.g. APIC) or optimizations (e.g. vhost) can be more complex than a
-simple system call, and exploits carry the same risks. Similarly, the user space
+simple system call, and exploits carry the same risks. Similarly, the userspace
 support code is frequently unsandboxed, and exploits, while rare, may allow
 unfettered access to the system.
 
@@ -245,8 +247,8 @@ In gVisor, the platforms that use ptrace operate differently. The stubs that are
 traced are never allowed to continue execution into the host kernel and complete
 a call directly. Instead, all system calls are interpreted and handled by the
 Sentry itself, who reflects resulting register state back into the tracee before
-continuing execution in user space. This is very similar to the mechanism used
-by User-Mode Linux (UML).
+continuing execution in userspace. This is very similar to the mechanism used by
+User-Mode Linux (UML).
 
 [dirtycow]: https://en.wikipedia.org/wiki/Dirty_COW
 [clang]: https://en.wikipedia.org/wiki/C_(programming_language)
diff --git a/g3doc/architecture_guide/security.png b/g3doc/architecture_guide/security.png
new file mode 100644
index 000000000..c29befbf6
--- /dev/null
+++ b/g3doc/architecture_guide/security.png
diff --git a/g3doc/architecture_guide/security.svg b/g3doc/architecture_guide/security.svg
new file mode 100644
index 000000000..0575e2dec
--- /dev/null
+++ b/g3doc/architecture_guide/security.svg
@@ -0,0 +1,153 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="92.963379mm"
+   height="107.18885mm"
+   viewBox="0 0 92.963379 107.18885"
+   version="1.1"
+   id="svg8"
+   inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
+   sodipodi:docname="defense.svg">
+  <defs
+     id="defs2" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.98994949"
+     inkscape:cx="-242.99254"
+     inkscape:cy="136.90181"
+     inkscape:document-units="mm"
+     inkscape:current-layer="layer4"
+     showgrid="false"
+     inkscape:object-nodes="true"
+     inkscape:window-width="1920"
+     inkscape:window-height="1005"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0" />
+  <metadata
+     id="metadata5">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:groupmode="layer"
+     id="layer2"
+     inkscape:label="Layer 2"
+     transform="translate(-61.112559,-78.160466)">
+    <g
+       id="g4644"
+       style="fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576"
+       transform="matrix(1,0,0,-1,2.138671,277.94235)">
+      <path
+         transform="scale(0.26458333)"
+         inkscape:connector-curvature="0"
+         style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:3.77952766;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576"
+         d="M 398.57227,351.84766 224.7832,452.18359 398.57227,552.51953 572.35938,452.18359 Z"
+         id="path4638" />
+      <path
+         inkscape:connector-curvature="0"
+         style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:3.77952766;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576"
+         d="M 572.35938,452.18359 398.57227,552.51953 V 753.19141 L 572.35938,652.85547 Z"
+         transform="scale(0.26458333)"
+         id="path4640" />
+      <path
+         id="path4642"
+         d="m 59.473888,119.64024 45.981172,26.54722 v 53.09443 L 59.473888,172.73467 Z"
+         style="opacity:1;fill:none;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.25572576"
+         inkscape:connector-curvature="0" />
+    </g>
+  </g>
+  <g
+     inkscape:groupmode="layer"
+     id="layer3"
+     inkscape:label="Layer 3"
+     transform="translate(-61.112559,-78.160466)">
+    <g
+       id="g4554"
+       transform="matrix(-0.39771468,0.69855937,-0.69855937,-0.39771468,366.58103,126.65261)">
+      <g
+         id="g4662"
+         transform="translate(59.46839,130.66062)">
+        <path
+           inkscape:connector-curvature="0"
+           id="path4548"
+           transform="scale(0.26458333)"
+           d="M 398.57227,351.84766 224.7832,452.18359 398.57227,552.51953 572.35938,452.18359 Z"
+           style="opacity:1;fill:#0066ff;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:4.70182848;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+        <path
+           inkscape:connector-curvature="0"
+           id="path4550"
+           transform="scale(0.26458333)"
+           d="M 572.35938,452.18359 398.57227,552.51953 V 753.19141 L 572.35938,652.85547 Z"
+           style="opacity:1;fill:#0044aa;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:4.29276943;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
+        <path
+           inkscape:connector-curvature="0"
+           style="opacity:1;fill:#5599ff;fill-opacity:0.34509804;stroke:#00a5ff;stroke-width:1.24402535;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+           d="m 59.473888,119.64024 45.981172,26.54722 v 53.09443 L 59.473888,172.73467 Z"
+           id="path4552" />
+      </g>
+    </g>
+  </g>
+  <g
+     inkscape:groupmode="layer"
+     id="layer4"
+     inkscape:label="Layer 4"
+     transform="translate(-61.112559,-78.160466)">
+    <path
+       style="fill:#e000ae;fill-opacity:1;stroke-width:0.12476727"
+       d="m 84.610811,107.36071 v 2.55773 2.55772 h 2.49535 2.49534 v -2.55772 -2.55773 h -2.49534 z m 40.674129,0 v 2.55773 2.55772 h 2.49535 2.49534 v -2.55772 -2.55773 h -2.49534 z m -35.558669,5.11545 v 2.55773 2.55773 h 2.49535 2.49534 v -2.55773 -2.55773 h -2.49534 z m 4.99069,5.11546 v 2.55773 2.55773 h -2.49534 -2.49535 v 2.49534 2.49535 h -2.55773 -2.55773 v 2.55773 2.55773 h -2.55773 -2.55773 v 10.16853 10.16853 h 2.55773 2.55773 v -7.67562 -7.67587 l 2.52654,0.0339 2.52654,0.0336 0.0327,5.08427 0.0327,5.08426 h 2.49388 2.49388 v 2.55919 2.5592 l 5.08427,-0.0327 5.084269,-0.0326 v -2.49534 -2.49535 l -5.084269,-0.0324 -5.08427,-0.0327 v -2.55626 -2.55651 h 12.726269 12.72626 v 2.55651 2.55626 l -5.05868,0.0327 -5.05893,0.0324 v 2.49535 2.49534 l 5.05893,0.0326 5.05868,0.0327 v -2.55919 -2.55919 h 2.49388 2.49413 l 0.0324,-5.08426 0.0327,-5.08427 2.52653,-0.0336 2.52654,-0.0339 v 7.67586 7.67563 h 2.55773 2.55773 v -10.16854 -10.16853 h -2.55773 -2.55773 v -2.55773 -2.55773 h -2.55773 -2.55773 v -2.49535 -2.49534 h -2.49535 -2.49534 v -2.55773 -2.55773 h -2.55773 -2.55773 v 2.55773 2.55773 h -7.6108 -7.610809 v -2.55773 -2.55773 h -2.55774 z m 25.452519,0 h 2.49535 2.49535 v -2.55773 -2.55773 h -2.49535 -2.49535 v 2.55773 z m -25.452519,10.10615 h 5.11546 5.115459 v 2.55773 2.55773 h -5.115459 -5.11546 v -2.55773 z m 15.221609,0 h 5.11546 5.11545 v 2.55773 2.55773 h -5.11545 -5.11546 v -2.55773 z"
+       id="path4732"
+       inkscape:connector-curvature="0" />
+  </g>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     style="display:inline"
+     transform="translate(-61.112559,-78.160466)">
+    <g
+       transform="translate(-131.49557,42.495842)"
+       style="fill:#007200;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+       id="g4628">
+      <path
+         id="path4529"
+         d="m 239.09034,36.164616 -45.98169,26.547215 45.98169,26.547217 45.98117,-26.547217 z"
+         style="opacity:1;fill:#4aba19;fill-opacity:0.34509804;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+         inkscape:connector-curvature="0" />
+      <path
+         id="path4531"
+         d="m 285.07151,62.711828 -45.98117,26.54722 v 53.094432 l 45.98117,-26.54722 z"
+         style="opacity:1;fill:#007900;fill-opacity:0.34351148;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+         inkscape:connector-curvature="0" />
+      <path
+         inkscape:connector-curvature="0"
+         style="opacity:1;fill:#003d00;fill-opacity:0.34509804;stroke:#00a500;stroke-width:1;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+         d="m 193.10865,62.711831 45.98117,26.54722 v 53.094429 l -45.98117,-26.54722 z"
+         id="path4541" />
+    </g>
+  </g>
+</svg>
diff --git a/g3doc/user_guide/filesystem.md b/g3doc/user_guide/filesystem.md
index 6c69f42a1..cd00762dd 100644
--- a/g3doc/user_guide/filesystem.md
+++ b/g3doc/user_guide/filesystem.md
@@ -4,8 +4,8 @@
 
 gVisor accesses the filesystem through a file proxy, called the Gofer. The gofer
 runs as a separate process, that is isolated from the sandbox. Gofer instances
-communicate with their respective sentry using the 9P protocol. For a more
-detailed explanation see [Overview > Gofer](../../architecture_guide/#gofer).
+communicate with their respective sentry using the 9P protocol. For another
+explanation see [What is gVisor?](../README.md).
 
 ## Sandbox overlay
 
diff --git a/g3doc/user_guide/install.md b/g3doc/user_guide/install.md
index 0de2b9932..9afdd264d 100644
--- a/g3doc/user_guide/install.md
+++ b/g3doc/user_guide/install.md
@@ -150,11 +150,8 @@ users, and ensure it is executable by all users**, since `runsc` executes itself
 as user `nobody` to avoid unnecessary privileges. The `/usr/local/bin` directory
 is a good place to put the `runsc` binary.
 
-After installation, the`runsc` binary comes with an `install` command that can
-optionally automatically configure Docker:
-
-```bash
-runsc install
-```
+After installation, try out `runsc` by following the
+[Docker Quick Start](./quick_start/docker.md) or
+[OCI Quick Start](./quick_start/oci.md).
 
 [releases]: https://github.com/google/gvisor/releases
diff --git a/g3doc/user_guide/platforms.md b/g3doc/user_guide/platforms.md
index eefb6b222..752025881 100644
--- a/g3doc/user_guide/platforms.md
+++ b/g3doc/user_guide/platforms.md
@@ -1,56 +1,27 @@
-# Platforms (KVM)
+# Changing Platforms
 
 [TOC]
 
-This document will help you set up your system to use a different gVisor
-platform.
+This guide described how to change the
+[platform](../architecture_guide/platforms.md) used by `runsc`.
 
-## What is a Platform?
+## Prerequisites
 
-gVisor requires a *platform* to implement interception of syscalls, basic
-context switching, and memory mapping functionality. These are described in more
-depth in the [Platform Design](../../architecture_guide/platforms/).
+If you intend to run the KVM platform, you will also to have KVM installed on
+your system. If you are running a Debian based system like Debian or Ubuntu you
+can usually do this by ensuring the module is loaded, and permissions are
+appropriately set on the `/dev/kvm` device.
 
-## Selecting a Platform
-
-The platform is selected by the `--platform` command line flag passed to
-`runsc`. By default, the ptrace platform is selected. To select a different
-platform, modify your Docker configuration (`/etc/docker/daemon.json`) to pass
-this argument:
-
-```json
-{
-    "runtimes": {
-        "runsc": {
-            "path": "/usr/local/bin/runsc",
-            "runtimeArgs": [
-                "--platform=kvm"
-            ]
-       }
-    }
-}
-```
-
-You must restart the Docker daemon after making changes to this file, typically
-this is done via `systemd`:
+If you have an Intel CPU:
 
 ```bash
-sudo systemctl restart docker
+sudo modprobe kvm-intel && sudo chmod a+rw /dev/kvm
 ```
 
-## Example: Using the KVM Platform
-
-The KVM platform is currently experimental; however, it provides several
-benefits over the default ptrace platform.
-
-### Prerequisites
-
-You will also to have KVM installed on your system. If you are running a Debian
-based system like Debian or Ubuntu you can usually do this by installing the
-`qemu-kvm` package.
+If you have an AMD CPU:
 
 ```bash
-sudo apt-get install qemu-kvm
+sudo modprobe kvm-amd && sudo chmod a+rw /dev/kvm
 ```
 
 If you are using a virtual machine you will need to make sure that nested
@@ -68,31 +39,22 @@ cause of security issues (e.g.
 [CVE-2018-12904](https://nvd.nist.gov/vuln/detail/CVE-2018-12904)). It is not
 recommended for production.***
 
-### Configuring Docker
-
-Per above, you will need to configure Docker to use `runsc` with the KVM
-platform. You will remember from the Docker Quick Start that you configured
-Docker to use `runsc` as the runtime. Docker allows you to add multiple runtimes
-to the Docker configuration.
+## Configuring Docker
 
-Add a new entry for the KVM platform entry to your Docker configuration
-(`/etc/docker/daemon.json`) in order to provide the `--platform=kvm` runtime
-argument.
-
-In the end, the file should look something like:
+The platform is selected by the `--platform` command line flag passed to
+`runsc`. By default, the ptrace platform is selected. For example, to select the
+KVM platform, modify your Docker configuration (`/etc/docker/daemon.json`) to
+pass the `--platform` argument:
 
 ```json
 {
     "runtimes": {
         "runsc": {
-            "path": "/usr/local/bin/runsc"
-        },
-        "runsc-kvm": {
             "path": "/usr/local/bin/runsc",
             "runtimeArgs": [
                 "--platform=kvm"
             ]
-        }
+       }
     }
 }
 ```
@@ -104,13 +66,27 @@ this is done via `systemd`:
 sudo systemctl restart docker
 ```
 
-## Running a container
+Note that you may configure multiple runtimes using different platforms. For
+example, the following configuration has one configuration for ptrace and one
+for the KVM platform:
 
-Now run your container using the `runsc-kvm` runtime. This will run the
-container using the KVM platform:
-
-```bash
-docker run --runtime=runsc-kvm --rm hello-world
+```json
+{
+    "runtimes": {
+        "runsc-ptrace": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--platform=ptrace"
+            ]
+        },
+        "runsc-kvm": {
+            "path": "/usr/local/bin/runsc",
+            "runtimeArgs": [
+                "--platform=kvm"
+            ]
+        }
+    }
+}
 ```
 
 [nested-azure]: https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nested-virtualization
diff --git a/g3doc/user_guide/quick_start/docker.md b/g3doc/user_guide/quick_start/docker.md
index 5228db4c0..6ad594ecc 100644
--- a/g3doc/user_guide/quick_start/docker.md
+++ b/g3doc/user_guide/quick_start/docker.md
@@ -1,4 +1,4 @@
-# Docker
+# Docker Quick Start
 
 > Note: This guide requires Docker version 17.09.0 or greater. Refer to the
 > [Docker documentation][docker] for how to install it.
@@ -14,24 +14,28 @@ the next section and proceed straight to running a container.
 ## Configuring Docker
 
 First you will need to configure Docker to use `runsc` by adding a runtime entry
-to your Docker configuration (`/etc/docker/daemon.json`). You may have to create
-this file if it does not exist. Also, some Docker versions also require you to
-[specify the `storage-driver` field][storage-driver].
-
-In the end, the file should look something like:
-
-```json
-{
-    "runtimes": {
-        "runsc": {
-            "path": "/usr/local/bin/runsc"
-        }
-    }
-}
+to your Docker configuration (e.g. `/etc/docker/daemon.json`). The easiest way
+to this is via the `runsc install` command. This will install a docker runtime
+named "runsc" by default.
+
+```bash
+sudo runsc install
+```
+
+You may also wish to install a runtime entry for debugging. The `runsc install`
+command can accept options that will be passed to the runtime when it is invoked
+by Docker.
+
+```bash
+sudo runsc install --runtime runsc-debug -- \
+  --debug \
+  --debug-log=/tmp/runsc-debug.log \
+  --strace \
+  --log-packets
 ```
 
-You must restart the Docker daemon after making changes to this file, typically
-this is done via `systemd`:
+You must restart the Docker daemon after installing the runtime. Typically this
+is done via `systemd`:
 
 ```bash
 sudo systemctl restart docker
diff --git a/g3doc/user_guide/quick_start/kubernetes.md b/g3doc/user_guide/quick_start/kubernetes.md
index b1f67252e..f875d8002 100644
--- a/g3doc/user_guide/quick_start/kubernetes.md
+++ b/g3doc/user_guide/quick_start/kubernetes.md
@@ -1,4 +1,4 @@
-# Kubernetes
+# Kubernetes Quick Start
 
 gVisor can be used to run Kubernetes pods and has several integration points
 with Kubernetes.
diff --git a/g3doc/user_guide/quick_start/oci.md b/g3doc/user_guide/quick_start/oci.md
index 57bcc4f63..877169145 100644
--- a/g3doc/user_guide/quick_start/oci.md
+++ b/g3doc/user_guide/quick_start/oci.md
@@ -1,4 +1,4 @@
-# OCI
+# OCI Quick Start
 
 This guide will quickly get you started running your first gVisor sandbox
 container using the runtime directly with the default platform.
diff --git a/g3doc/user_guide/tutorials/docker.md b/g3doc/user_guide/tutorials/docker.md
index c0a3db506..705560038 100644
--- a/g3doc/user_guide/tutorials/docker.md
+++ b/g3doc/user_guide/tutorials/docker.md
@@ -1,4 +1,4 @@
-# WorkPress with Docker
+# WordPress with Docker
 
 This page shows you how to deploy a sample [WordPress][wordpress] site using
 [Docker][docker].
diff --git a/images/jekyll/Dockerfile b/images/jekyll/Dockerfile
index cefd949a6..4860dd750 100644
--- a/images/jekyll/Dockerfile
+++ b/images/jekyll/Dockerfile
@@ -8,5 +8,6 @@ RUN gem install \
         jekyll-paginate:1.1.0 \
         kramdown-parser-gfm:1.1.0 \
         jekyll-relative-links:0.6.1 \
-        jekyll-feed:0.13.0
+        jekyll-feed:0.13.0 \
+        jekyll-sitemap:1.4.0
 CMD ["/usr/gem/gems/jekyll-4.0.0/exe/jekyll", "build", "-t", "-s", "/input", "-d", "/output"]
diff --git a/images/packetdrill/Dockerfile b/images/packetdrill/Dockerfile
index 7a006c85f..01296dbaf 100644
--- a/images/packetdrill/Dockerfile
+++ b/images/packetdrill/Dockerfile
@@ -2,7 +2,7 @@ FROM ubuntu:bionic
 RUN apt-get update && apt-get install -y net-tools git iptables iputils-ping \
         netcat tcpdump jq tar bison flex make
 RUN hash -r
-RUN git clone --branch packetdrill-v2.0 \
+RUN git clone --depth 1 --branch packetdrill-v2.0 \
         https://github.com/google/packetdrill.git
 RUN cd packetdrill/gtests/net/packetdrill && ./configure && make
 CMD /bin/bash
diff --git a/images/tmpfile/Dockerfile b/images/tmpfile/Dockerfile
new file mode 100644
index 000000000..e3816c8cb
--- /dev/null
+++ b/images/tmpfile/Dockerfile
@@ -0,0 +1,4 @@
+# Create file under /tmp to ensure files inside '/tmp' are not overridden.
+FROM alpine:3.11.5
+RUN mkdir -p /tmp/foo \
+  && echo 123 > /tmp/foo/file.txt
diff --git a/pkg/abi/linux/ip.go b/pkg/abi/linux/ip.go
index 31e56ffa6..ef6d1093e 100644
--- a/pkg/abi/linux/ip.go
+++ b/pkg/abi/linux/ip.go
@@ -92,6 +92,16 @@ const (
 	IP_UNICAST_IF             = 50
 )
 
+// IP_MTU_DISCOVER values from uapi/linux/in.h
+const (
+	IP_PMTUDISC_DONT      = 0
+	IP_PMTUDISC_WANT      = 1
+	IP_PMTUDISC_DO        = 2
+	IP_PMTUDISC_PROBE     = 3
+	IP_PMTUDISC_INTERFACE = 4
+	IP_PMTUDISC_OMIT      = 5
+)
+
 // Socket options from uapi/linux/in6.h
 const (
 	IPV6_ADDRFORM         = 1
diff --git a/pkg/abi/linux/tcp.go b/pkg/abi/linux/tcp.go
index 174d470e2..2a8d4708b 100644
--- a/pkg/abi/linux/tcp.go
+++ b/pkg/abi/linux/tcp.go
@@ -57,4 +57,5 @@ const (
 const (
 	MAX_TCP_KEEPIDLE  = 32767
 	MAX_TCP_KEEPINTVL = 32767
+	MAX_TCP_KEEPCNT   = 127
 )
diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go
index 0e5b86344..b789e56e9 100644
--- a/pkg/buffer/safemem.go
+++ b/pkg/buffer/safemem.go
@@ -28,12 +28,11 @@ func (b *buffer) ReadBlock() safemem.Block {
 	return safemem.BlockFromSafeSlice(b.ReadSlice())
 }
 
-// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
-//
-// This will advance the write index.
-func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
-	need := int(srcs.NumBytes())
-	if need == 0 {
+// WriteFromSafememReader writes up to count bytes from r to v and advances the
+// write index by the number of bytes written. It calls r.ReadToBlocks() at
+// most once.
+func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) {
+	if count == 0 {
 		return 0, nil
 	}
 
@@ -50,32 +49,33 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
 	}
 
 	// Does the last block have sufficient capacity alone?
-	if l := firstBuf.WriteSize(); l >= need {
-		dst = safemem.BlockSeqOf(firstBuf.WriteBlock())
+	if l := uint64(firstBuf.WriteSize()); l >= count {
+		dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count))
 	} else {
 		// Append blocks until sufficient.
-		need -= l
+		count -= l
 		blocks = append(blocks, firstBuf.WriteBlock())
-		for need > 0 {
+		for count > 0 {
 			emptyBuf := bufferPool.Get().(*buffer)
 			v.data.PushBack(emptyBuf)
-			need -= emptyBuf.WriteSize()
-			blocks = append(blocks, emptyBuf.WriteBlock())
+			block := emptyBuf.WriteBlock().TakeFirst64(count)
+			count -= uint64(block.Len())
+			blocks = append(blocks, block)
 		}
 		dst = safemem.BlockSeqFromSlice(blocks)
 	}
 
-	// Perform the copy.
-	n, err := safemem.CopySeq(dst, srcs)
+	// Perform I/O.
+	n, err := r.ReadToBlocks(dst)
 	v.size += int64(n)
 
 	// Update all indices.
-	for left := int(n); left > 0; firstBuf = firstBuf.Next() {
-		if l := firstBuf.WriteSize(); left >= l {
+	for left := n; left > 0; firstBuf = firstBuf.Next() {
+		if l := firstBuf.WriteSize(); left >= uint64(l) {
 			firstBuf.WriteMove(l) // Whole block.
-			left -= l
+			left -= uint64(l)
 		} else {
-			firstBuf.WriteMove(left) // Partial block.
+			firstBuf.WriteMove(int(left)) // Partial block.
 			left = 0
 		}
 	}
@@ -83,14 +83,16 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
 	return n, err
 }
 
-// ReadToBlocks implements safemem.Reader.ReadToBlocks.
-//
-// This will not advance the read index; the caller should follow
-// this call with a call to TrimFront in order to remove the read
-// data from the buffer. This is done to support pipe sematics.
-func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
-	need := int(dsts.NumBytes())
-	if need == 0 {
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the
+// write index by the number of bytes written.
+func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes())
+}
+
+// ReadToSafememWriter reads up to count bytes from v to w. It does not advance
+// the read index. It calls w.WriteFromBlocks() at most once.
+func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) {
+	if count == 0 {
 		return 0, nil
 	}
 
@@ -105,25 +107,27 @@ func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
 	}
 
 	// Is all the data in a single block?
-	if l := firstBuf.ReadSize(); l >= need {
-		src = safemem.BlockSeqOf(firstBuf.ReadBlock())
+	if l := uint64(firstBuf.ReadSize()); l >= count {
+		src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count))
 	} else {
 		// Build a list of all the buffers.
-		need -= l
+		count -= l
 		blocks = append(blocks, firstBuf.ReadBlock())
-		for buf := firstBuf.Next(); buf != nil && need > 0; buf = buf.Next() {
-			need -= buf.ReadSize()
-			blocks = append(blocks, buf.ReadBlock())
+		for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() {
+			block := buf.ReadBlock().TakeFirst64(count)
+			count -= uint64(block.Len())
+			blocks = append(blocks, block)
 		}
 		src = safemem.BlockSeqFromSlice(blocks)
 	}
 
-	// Perform the copy.
-	n, err := safemem.CopySeq(dsts, src)
-
-	// See above: we would normally advance the read index here, but we
-	// don't do that in order to support pipe semantics. We rely on a
-	// separate call to TrimFront() in this case.
+	// Perform I/O. As documented, we don't advance the read index.
+	return w.WriteFromBlocks(src)
+}
 
-	return n, err
+// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the
+// read index by the number of bytes read, such that it's only safe to call if
+// the caller guarantees that ReadToBlocks will only be called once.
+func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes())
 }
diff --git a/pkg/cleanup/BUILD b/pkg/cleanup/BUILD
new file mode 100644
index 000000000..5c34b9872
--- /dev/null
+++ b/pkg/cleanup/BUILD
@@ -0,0 +1,17 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "cleanup",
+    srcs = ["cleanup.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+    ],
+)
+
+go_test(
+    name = "cleanup_test",
+    srcs = ["cleanup_test.go"],
+    library = ":cleanup",
+)
diff --git a/pkg/cleanup/cleanup.go b/pkg/cleanup/cleanup.go
new file mode 100644
index 000000000..14a05f076
--- /dev/null
+++ b/pkg/cleanup/cleanup.go
@@ -0,0 +1,60 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cleanup provides utilities to clean "stuff" on defers.
+package cleanup
+
+// Cleanup allows defers to be aborted when cleanup needs to happen
+// conditionally. Usage:
+// 	 cu := cleanup.Make(func() { f.Close() })
+// 	 defer cu.Clean() // failure before release is called will close the file.
+// 	 ...
+//   cu.Add(func() { f2.Close() })  // Adds another cleanup function
+//   ...
+// 	 cu.Release() // on success, aborts closing the file.
+// 	 return f
+type Cleanup struct {
+	cleaners []func()
+}
+
+// Make creates a new Cleanup object.
+func Make(f func()) Cleanup {
+	return Cleanup{cleaners: []func(){f}}
+}
+
+// Add adds a new function to be called on Clean().
+func (c *Cleanup) Add(f func()) {
+	c.cleaners = append(c.cleaners, f)
+}
+
+// Clean calls all cleanup functions in reverse order.
+func (c *Cleanup) Clean() {
+	clean(c.cleaners)
+	c.cleaners = nil
+}
+
+// Release releases the cleanup from its duties, i.e. cleanup functions are not
+// called after this point. Returns a function that calls all registered
+// functions in case the caller has use for them.
+func (c *Cleanup) Release() func() {
+	old := c.cleaners
+	c.cleaners = nil
+	return func() { clean(old) }
+}
+
+func clean(cleaners []func()) {
+	for i := len(cleaners) - 1; i >= 0; i-- {
+		cleaners[i]()
+	}
+}
diff --git a/pkg/cleanup/cleanup_test.go b/pkg/cleanup/cleanup_test.go
new file mode 100644
index 000000000..ab3d9ed95
--- /dev/null
+++ b/pkg/cleanup/cleanup_test.go
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cleanup
+
+import "testing"
+
+func testCleanupHelper(clean, cleanAdd *bool, release bool) func() {
+	cu := Make(func() {
+		*clean = true
+	})
+	cu.Add(func() {
+		*cleanAdd = true
+	})
+	defer cu.Clean()
+	if release {
+		return cu.Release()
+	}
+	return nil
+}
+
+func TestCleanup(t *testing.T) {
+	clean := false
+	cleanAdd := false
+	testCleanupHelper(&clean, &cleanAdd, false)
+	if !clean {
+		t.Fatalf("cleanup function was not called.")
+	}
+	if !cleanAdd {
+		t.Fatalf("added cleanup function was not called.")
+	}
+}
+
+func TestRelease(t *testing.T) {
+	clean := false
+	cleanAdd := false
+	cleaner := testCleanupHelper(&clean, &cleanAdd, true)
+
+	// Check that clean was not called after release.
+	if clean {
+		t.Fatalf("cleanup function was called.")
+	}
+	if cleanAdd {
+		t.Fatalf("added cleanup function was called.")
+	}
+
+	// Call the cleaner function and check that both cleanup functions are called.
+	cleaner()
+	if !clean {
+		t.Fatalf("cleanup function was not called.")
+	}
+	if !cleanAdd {
+		t.Fatalf("added cleanup function was not called.")
+	}
+}
diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD
index 9c5ad500b..aa8e4e1f3 100644
--- a/pkg/flipcall/BUILD
+++ b/pkg/flipcall/BUILD
@@ -11,6 +11,7 @@ go_library(
         "futex_linux.go",
         "io.go",
         "packet_window_allocator.go",
+        "packet_window_mmap.go",
     ],
     visibility = ["//visibility:public"],
     deps = [
diff --git a/pkg/flipcall/flipcall.go b/pkg/flipcall/flipcall.go
index 3cdb576e1..ec742c091 100644
--- a/pkg/flipcall/flipcall.go
+++ b/pkg/flipcall/flipcall.go
@@ -95,7 +95,7 @@ func (ep *Endpoint) Init(side EndpointSide, pwd PacketWindowDescriptor, opts ...
 	if pwd.Length > math.MaxUint32 {
 		return fmt.Errorf("packet window size (%d) exceeds maximum (%d)", pwd.Length, math.MaxUint32)
 	}
-	m, _, e := syscall.RawSyscall6(syscall.SYS_MMAP, 0, uintptr(pwd.Length), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED, uintptr(pwd.FD), uintptr(pwd.Offset))
+	m, e := packetWindowMmap(pwd)
 	if e != 0 {
 		return fmt.Errorf("failed to mmap packet window: %v", e)
 	}
diff --git a/pkg/flipcall/packet_window_mmap.go b/pkg/flipcall/packet_window_mmap.go
new file mode 100644
index 000000000..869183b11
--- /dev/null
+++ b/pkg/flipcall/packet_window_mmap.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package flipcall
+
+import (
+	"syscall"
+)
+
+// Return a memory mapping of the pwd in memory that can be shared outside the sandbox.
+func packetWindowMmap(pwd PacketWindowDescriptor) (uintptr, syscall.Errno) {
+	m, _, err := syscall.RawSyscall6(syscall.SYS_MMAP, 0, uintptr(pwd.Length), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED, uintptr(pwd.FD), uintptr(pwd.Offset))
+	return m, err
+}
diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD
index ea8d2422c..7a82631c5 100644
--- a/pkg/goid/BUILD
+++ b/pkg/goid/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "goid.go",
         "goid_amd64.s",
+        "goid_arm64.s",
         "goid_race.go",
         "goid_unsafe.go",
     ],
diff --git a/pkg/goid/goid_arm64.s b/pkg/goid/goid_arm64.s
new file mode 100644
index 000000000..a7465b75d
--- /dev/null
+++ b/pkg/goid/goid_arm64.s
@@ -0,0 +1,21 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// func getg() *g
+TEXT ·getg(SB),NOSPLIT,$0-8
+        MOVD g, R0      // g
+        MOVD R0, ret+0(FP)
+        RET
diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD
index 41bf104d0..f84d03700 100644
--- a/pkg/linewriter/BUILD
+++ b/pkg/linewriter/BUILD
@@ -5,6 +5,8 @@ package(licenses = ["notice"])
 go_library(
     name = "linewriter",
     srcs = ["linewriter.go"],
+    marshal = False,
+    stateify = False,
     visibility = ["//visibility:public"],
     deps = ["//pkg/sync"],
 )
diff --git a/pkg/log/BUILD b/pkg/log/BUILD
index a7c8f7bef..3ed6aba5c 100644
--- a/pkg/log/BUILD
+++ b/pkg/log/BUILD
@@ -10,6 +10,8 @@ go_library(
         "json_k8s.go",
         "log.go",
     ],
+    marshal = False,
+    stateify = False,
     visibility = [
         "//visibility:public",
     ],
diff --git a/pkg/merkletree/BUILD b/pkg/merkletree/BUILD
new file mode 100644
index 000000000..5b0e4143a
--- /dev/null
+++ b/pkg/merkletree/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "merkletree",
+    srcs = ["merkletree.go"],
+    deps = ["//pkg/usermem"],
+)
+
+go_test(
+    name = "merkletree_test",
+    srcs = ["merkletree_test.go"],
+    library = ":merkletree",
+    deps = ["//pkg/usermem"],
+)
diff --git a/pkg/merkletree/merkletree.go b/pkg/merkletree/merkletree.go
new file mode 100644
index 000000000..906f67943
--- /dev/null
+++ b/pkg/merkletree/merkletree.go
@@ -0,0 +1,135 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package merkletree implements Merkle tree generating and verification.
+package merkletree
+
+import (
+	"crypto/sha256"
+	"io"
+
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const (
+	// sha256DigestSize specifies the digest size of a SHA256 hash.
+	sha256DigestSize = 32
+)
+
+// Size defines the scale of a Merkle tree.
+type Size struct {
+	// blockSize is the size of a data block to be hashed.
+	blockSize int64
+	// digestSize is the size of a generated hash.
+	digestSize int64
+	// hashesPerBlock is the number of hashes in a block. For example, if
+	// blockSize is 4096 bytes, and digestSize is 32 bytes, there will be 128
+	// hashesPerBlock. Therefore 128 hashes in a lower level will be put into a
+	// block and generate a single hash in an upper level.
+	hashesPerBlock int64
+	// levelStart is the start block index of each level. The number of levels in
+	// the tree is the length of the slice. The leafs (level 0) are hashes of
+	// blocks in the input data. The levels above are hashes of lower level
+	// hashes.  The highest level is the root hash.
+	levelStart []int64
+}
+
+// MakeSize initializes and returns a new Size object describing the structure
+// of a tree. dataSize specifies the number of the file system size in bytes.
+func MakeSize(dataSize int64) Size {
+	size := Size{
+		blockSize: usermem.PageSize,
+		// TODO(b/156980949): Allow config other hash methods (SHA384/SHA512).
+		digestSize:     sha256DigestSize,
+		hashesPerBlock: usermem.PageSize / sha256DigestSize,
+	}
+	numBlocks := (dataSize + size.blockSize - 1) / size.blockSize
+	level := int64(0)
+	offset := int64(0)
+
+	// Calcuate the number of levels in the Merkle tree and the beginning offset
+	// of each level. Level 0 is the level directly above the data blocks, while
+	// level NumLevels - 1 is the root.
+	for numBlocks > 1 {
+		size.levelStart = append(size.levelStart, offset)
+		// Round numBlocks up to fill up a block.
+		numBlocks += (size.hashesPerBlock - numBlocks%size.hashesPerBlock) % size.hashesPerBlock
+		offset += numBlocks / size.hashesPerBlock
+		numBlocks = numBlocks / size.hashesPerBlock
+		level++
+	}
+	size.levelStart = append(size.levelStart, offset)
+	return size
+}
+
+// Generate constructs a Merkle tree for the contents of data. The output is
+// written to treeWriter. The treeReader should be able to read the tree after
+// it has been written. That is, treeWriter and treeReader should point to the
+// same underlying data but have separate cursors.
+func Generate(data io.Reader, dataSize int64, treeReader io.Reader, treeWriter io.Writer) ([]byte, error) {
+	size := MakeSize(dataSize)
+
+	numBlocks := (dataSize + size.blockSize - 1) / size.blockSize
+
+	var root []byte
+	for level := 0; level < len(size.levelStart); level++ {
+		for i := int64(0); i < numBlocks; i++ {
+			buf := make([]byte, size.blockSize)
+			var (
+				n   int
+				err error
+			)
+			if level == 0 {
+				// Read data block from the target file since level 0 is directly above
+				// the raw data block.
+				n, err = data.Read(buf)
+			} else {
+				// Read data block from the tree file since levels higher than 0 are
+				// hashing the lower level hashes.
+				n, err = treeReader.Read(buf)
+			}
+
+			// err is populated as long as the bytes read is smaller than the buffer
+			// size. This could be the case if we are reading the last block, and
+			// break in that case. If this is the last block, the end of the block
+			// will be zero-padded.
+			if n == 0 && err == io.EOF {
+				break
+			} else if err != nil && err != io.EOF {
+				return nil, err
+			}
+			// Hash the bytes in buf.
+			digest := sha256.Sum256(buf)
+
+			if level == len(size.levelStart)-1 {
+				root = digest[:]
+			}
+
+			// Write the generated hash to the end of the tree file.
+			if _, err = treeWriter.Write(digest[:]); err != nil {
+				return nil, err
+			}
+		}
+		// If the genereated digests do not round up to a block, zero-padding the
+		// remaining of the last block. But no need to do so for root.
+		if level != len(size.levelStart)-1 && numBlocks%size.hashesPerBlock != 0 {
+			zeroBuf := make([]byte, size.blockSize-(numBlocks%size.hashesPerBlock)*size.digestSize)
+			if _, err := treeWriter.Write(zeroBuf[:]); err != nil {
+				return nil, err
+			}
+		}
+		numBlocks = (numBlocks + size.hashesPerBlock - 1) / size.hashesPerBlock
+	}
+	return root, nil
+}
diff --git a/pkg/merkletree/merkletree_test.go b/pkg/merkletree/merkletree_test.go
new file mode 100644
index 000000000..7344db0b6
--- /dev/null
+++ b/pkg/merkletree/merkletree_test.go
@@ -0,0 +1,122 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package merkletree
+
+import (
+	"bytes"
+	"fmt"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func TestSize(t *testing.T) {
+	testCases := []struct {
+		dataSize           int64
+		expectedLevelStart []int64
+	}{
+		{
+			dataSize:           100,
+			expectedLevelStart: []int64{0},
+		},
+		{
+			dataSize:           1000000,
+			expectedLevelStart: []int64{0, 2, 3},
+		},
+		{
+			dataSize:           4096 * int64(usermem.PageSize),
+			expectedLevelStart: []int64{0, 32, 33},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("%d", tc.dataSize), func(t *testing.T) {
+			s := MakeSize(tc.dataSize)
+			if s.blockSize != int64(usermem.PageSize) {
+				t.Errorf("got blockSize %d, want %d", s.blockSize, usermem.PageSize)
+			}
+			if s.digestSize != sha256DigestSize {
+				t.Errorf("got digestSize %d, want %d", s.digestSize, sha256DigestSize)
+			}
+			if len(s.levelStart) != len(tc.expectedLevelStart) {
+				t.Errorf("got levels %d, want %d", len(s.levelStart), len(tc.expectedLevelStart))
+			}
+			for i := 0; i < len(s.levelStart) && i < len(tc.expectedLevelStart); i++ {
+				if s.levelStart[i] != tc.expectedLevelStart[i] {
+					t.Errorf("got levelStart[%d] %d, want %d", i, s.levelStart[i], tc.expectedLevelStart[i])
+				}
+			}
+		})
+	}
+}
+
+func TestGenerate(t *testing.T) {
+	// The input data has size dataSize. It starts with the data in startWith,
+	// and all other bytes are zeroes.
+	testCases := []struct {
+		dataSize     int
+		startWith    []byte
+		expectedRoot []byte
+	}{
+		{
+			dataSize:     usermem.PageSize,
+			startWith:    nil,
+			expectedRoot: []byte{173, 127, 172, 178, 88, 111, 198, 233, 102, 192, 4, 215, 209, 209, 107, 2, 79, 88, 5, 255, 124, 180, 124, 122, 133, 218, 189, 139, 72, 137, 44, 167},
+		},
+		{
+			dataSize:     128*usermem.PageSize + 1,
+			startWith:    nil,
+			expectedRoot: []byte{62, 93, 40, 92, 161, 241, 30, 223, 202, 99, 39, 2, 132, 113, 240, 139, 117, 99, 79, 243, 54, 18, 100, 184, 141, 121, 238, 46, 149, 202, 203, 132},
+		},
+		{
+			dataSize:     1,
+			startWith:    []byte{'a'},
+			expectedRoot: []byte{52, 75, 204, 142, 172, 129, 37, 14, 145, 137, 103, 203, 11, 162, 209, 205, 30, 169, 213, 72, 20, 28, 243, 24, 242, 2, 92, 43, 169, 59, 110, 210},
+		},
+		{
+			dataSize:     1,
+			startWith:    []byte{'1'},
+			expectedRoot: []byte{74, 35, 103, 179, 176, 149, 254, 112, 42, 65, 104, 66, 119, 56, 133, 124, 228, 15, 65, 161, 150, 0, 117, 174, 242, 34, 115, 115, 218, 37, 3, 105},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("%d", tc.dataSize), func(t *testing.T) {
+			var (
+				data bytes.Buffer
+				tree bytes.Buffer
+			)
+
+			startSize := len(tc.startWith)
+			_, err := data.Write(tc.startWith)
+			if err != nil {
+				t.Fatalf("Failed to write to data: %v", err)
+			}
+			_, err = data.Write(make([]byte, tc.dataSize-startSize))
+			if err != nil {
+				t.Fatalf("Failed to write to data: %v", err)
+			}
+
+			root, err := Generate(&data, int64(tc.dataSize), &tree, &tree)
+			if err != nil {
+				t.Fatalf("Generate failed: %v", err)
+			}
+
+			if !bytes.Equal(root, tc.expectedRoot) {
+				t.Errorf("Unexpected root")
+			}
+		})
+	}
+}
diff --git a/pkg/p9/server.go b/pkg/p9/server.go
index fdfa83648..60cf94fa1 100644
--- a/pkg/p9/server.go
+++ b/pkg/p9/server.go
@@ -482,10 +482,10 @@ func (cs *connState) handle(m message) (r message) {
 	defer func() {
 		if r == nil {
 			// Don't allow a panic to propagate.
-			recover()
+			err := recover()
 
 			// Include a useful log message.
-			log.Warningf("panic in handler: %s", debug.Stack())
+			log.Warningf("panic in handler: %v\n%s", err, debug.Stack())
 
 			// Wrap in an EFAULT error; we don't really have a
 			// better way to describe this kind of error. It will
diff --git a/pkg/procid/procid_amd64.s b/pkg/procid/procid_amd64.s
index 38cea9be3..7c622e5d7 100644
--- a/pkg/procid/procid_amd64.s
+++ b/pkg/procid/procid_amd64.s
@@ -14,7 +14,7 @@
 
 // +build amd64
 // +build go1.8
-// +build !go1.15
+// +build !go1.16
 
 #include "textflag.h"
 
diff --git a/pkg/procid/procid_arm64.s b/pkg/procid/procid_arm64.s
index 4f4b70fef..48ebb5fd1 100644
--- a/pkg/procid/procid_arm64.s
+++ b/pkg/procid/procid_arm64.s
@@ -14,7 +14,7 @@
 
 // +build arm64
 // +build go1.8
-// +build !go1.15
+// +build !go1.16
 
 #include "textflag.h"
 
diff --git a/pkg/segment/BUILD b/pkg/segment/BUILD
index 1b487b887..f57ccc170 100644
--- a/pkg/segment/BUILD
+++ b/pkg/segment/BUILD
@@ -21,6 +21,8 @@ go_template(
     ],
     opt_consts = [
         "minDegree",
+        # trackGaps must either be 0 or 1.
+        "trackGaps",
     ],
     types = [
         "Key",
diff --git a/pkg/segment/set.go b/pkg/segment/set.go
index 03e4f258f..1a17ad9cb 100644
--- a/pkg/segment/set.go
+++ b/pkg/segment/set.go
@@ -36,6 +36,34 @@ type Range interface{}
 // Value is a required type parameter.
 type Value interface{}
 
+// trackGaps is an optional parameter.
+//
+// If trackGaps is 1, the Set will track maximum gap size recursively,
+// enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this
+// case, Key must be an unsigned integer.
+//
+// trackGaps must be 0 or 1.
+const trackGaps = 0
+
+var _ = uint8(trackGaps << 7) // Will fail if not zero or one.
+
+// dynamicGap is a type that disappears if trackGaps is 0.
+type dynamicGap [trackGaps]Key
+
+// Get returns the value of the gap.
+//
+// Precondition: trackGaps must be non-zero.
+func (d *dynamicGap) Get() Key {
+	return d[:][0]
+}
+
+// Set sets the value of the gap.
+//
+// Precondition: trackGaps must be non-zero.
+func (d *dynamicGap) Set(v Key) {
+	d[:][0] = v
+}
+
 // Functions is a required type parameter that must be a struct implementing
 // the methods defined by Functions.
 type Functions interface {
@@ -327,8 +355,12 @@ func (s *Set) Insert(gap GapIterator, r Range, val Value) Iterator {
 	}
 	if prev.Ok() && prev.End() == r.Start {
 		if mval, ok := (Functions{}).Merge(prev.Range(), prev.Value(), r, val); ok {
+			shrinkMaxGap := trackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get()
 			prev.SetEndUnchecked(r.End)
 			prev.SetValue(mval)
+			if shrinkMaxGap {
+				gap.node.updateMaxGapLeaf()
+			}
 			if next.Ok() && next.Start() == r.End {
 				val = mval
 				if mval, ok := (Functions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok {
@@ -342,11 +374,16 @@ func (s *Set) Insert(gap GapIterator, r Range, val Value) Iterator {
 	}
 	if next.Ok() && next.Start() == r.End {
 		if mval, ok := (Functions{}).Merge(r, val, next.Range(), next.Value()); ok {
+			shrinkMaxGap := trackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get()
 			next.SetStartUnchecked(r.Start)
 			next.SetValue(mval)
+			if shrinkMaxGap {
+				gap.node.updateMaxGapLeaf()
+			}
 			return next
 		}
 	}
+	// InsertWithoutMergingUnchecked will maintain maxGap if necessary.
 	return s.InsertWithoutMergingUnchecked(gap, r, val)
 }
 
@@ -373,11 +410,15 @@ func (s *Set) InsertWithoutMerging(gap GapIterator, r Range, val Value) Iterator
 // Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
 func (s *Set) InsertWithoutMergingUnchecked(gap GapIterator, r Range, val Value) Iterator {
 	gap = gap.node.rebalanceBeforeInsert(gap)
+	splitMaxGap := trackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get())
 	copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments])
 	copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments])
 	gap.node.keys[gap.index] = r
 	gap.node.values[gap.index] = val
 	gap.node.nrSegments++
+	if splitMaxGap {
+		gap.node.updateMaxGapLeaf()
+	}
 	return Iterator{gap.node, gap.index}
 }
 
@@ -399,12 +440,23 @@ func (s *Set) Remove(seg Iterator) GapIterator {
 		// overlap.
 		seg.SetRangeUnchecked(victim.Range())
 		seg.SetValue(victim.Value())
+		// Need to update the nextAdjacentNode's maxGap because the gap in between
+		// must have been modified by updating seg.Range() to victim.Range().
+		// seg.NextSegment() must exist since the last segment can't be in a
+		// non-leaf node.
+		nextAdjacentNode := seg.NextSegment().node
+		if trackGaps != 0 {
+			nextAdjacentNode.updateMaxGapLeaf()
+		}
 		return s.Remove(victim).NextGap()
 	}
 	copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments])
 	copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments])
 	Functions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1])
 	seg.node.nrSegments--
+	if trackGaps != 0 {
+		seg.node.updateMaxGapLeaf()
+	}
 	return seg.node.rebalanceAfterRemove(GapIterator{seg.node, seg.index})
 }
 
@@ -455,6 +507,7 @@ func (s *Set) MergeUnchecked(first, second Iterator) Iterator {
 			// overlaps second.
 			first.SetEndUnchecked(second.End())
 			first.SetValue(mval)
+			// Remove will handle the maxGap update if necessary.
 			return s.Remove(second).PrevSegment()
 		}
 	}
@@ -631,6 +684,12 @@ type node struct {
 	// than "isLeaf" because false must be the correct value for an empty root.
 	hasChildren bool
 
+	// The longest gap within this node. If the node is a leaf, it's simply the
+	// maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys
+	// including the 0th and nrSegments-th gap possibly shared with its upper-level
+	// nodes; if it's a non-leaf node, it's the max of all children's maxGap.
+	maxGap dynamicGap
+
 	// Nodes store keys and values in separate arrays to maximize locality in
 	// the common case (scanning keys for lookup).
 	keys     [maxDegree - 1]Range
@@ -676,12 +735,12 @@ func (n *node) nextSibling() *node {
 // required for insertion, and returns an updated iterator to the position
 // represented by gap.
 func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator {
-	if n.parent != nil {
-		gap = n.parent.rebalanceBeforeInsert(gap)
-	}
 	if n.nrSegments < maxDegree-1 {
 		return gap
 	}
+	if n.parent != nil {
+		gap = n.parent.rebalanceBeforeInsert(gap)
+	}
 	if n.parent == nil {
 		// n is root. Move all segments before and after n's median segment
 		// into new child nodes adjacent to the median segment, which is now
@@ -719,6 +778,13 @@ func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator {
 		n.hasChildren = true
 		n.children[0] = left
 		n.children[1] = right
+		// In this case, n's maxGap won't violated as it's still the root,
+		// but the left and right children should be updated locally as they
+		// are newly split from n.
+		if trackGaps != 0 {
+			left.updateMaxGapLocal()
+			right.updateMaxGapLocal()
+		}
 		if gap.node != n {
 			return gap
 		}
@@ -758,6 +824,12 @@ func (n *node) rebalanceBeforeInsert(gap GapIterator) GapIterator {
 		}
 	}
 	n.nrSegments = minDegree - 1
+	// MaxGap of n's parent is not violated because the segments within is not changed.
+	// n and its sibling's maxGap need to be updated locally as they are two new nodes split from old n.
+	if trackGaps != 0 {
+		n.updateMaxGapLocal()
+		sibling.updateMaxGapLocal()
+	}
 	// gap.node can't be n.parent because gaps are always in leaf nodes.
 	if gap.node != n {
 		return gap
@@ -821,6 +893,12 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator {
 			}
 			n.nrSegments++
 			sibling.nrSegments--
+			// n's parent's maxGap does not need to be updated as its content is unmodified.
+			// n and its sibling must be updated with (new) maxGap because of the shift of keys.
+			if trackGaps != 0 {
+				n.updateMaxGapLocal()
+				sibling.updateMaxGapLocal()
+			}
 			if gap.node == sibling && gap.index == sibling.nrSegments {
 				return GapIterator{n, 0}
 			}
@@ -849,6 +927,12 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator {
 			}
 			n.nrSegments++
 			sibling.nrSegments--
+			// n's parent's maxGap does not need to be updated as its content is unmodified.
+			// n and its sibling must be updated with (new) maxGap because of the shift of keys.
+			if trackGaps != 0 {
+				n.updateMaxGapLocal()
+				sibling.updateMaxGapLocal()
+			}
 			if gap.node == sibling {
 				if gap.index == 0 {
 					return GapIterator{n, n.nrSegments}
@@ -886,6 +970,7 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator {
 				p.children[0] = nil
 				p.children[1] = nil
 			}
+			// No need to update maxGap of p as its content is not changed.
 			if gap.node == left {
 				return GapIterator{p, gap.index}
 			}
@@ -932,11 +1017,152 @@ func (n *node) rebalanceAfterRemove(gap GapIterator) GapIterator {
 		}
 		p.children[p.nrSegments] = nil
 		p.nrSegments--
+		// Update maxGap of left locally, no need to change p and right because
+		// p's contents is not changed and right is already invalid.
+		if trackGaps != 0 {
+			left.updateMaxGapLocal()
+		}
 		// This process robs p of one segment, so recurse into rebalancing p.
 		n = p
 	}
 }
 
+// updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no
+// necessary update.
+//
+// Preconditions: n must be a leaf node, trackGaps must be 1.
+func (n *node) updateMaxGapLeaf() {
+	if n.hasChildren {
+		panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n))
+	}
+	max := n.calculateMaxGapLeaf()
+	if max == n.maxGap.Get() {
+		// If new max equals the old maxGap, no update is needed.
+		return
+	}
+	oldMax := n.maxGap.Get()
+	n.maxGap.Set(max)
+	if max > oldMax {
+		// Grow ancestor maxGaps.
+		for p := n.parent; p != nil; p = p.parent {
+			if p.maxGap.Get() >= max {
+				// p and its ancestors already contain an equal or larger gap.
+				break
+			}
+			// Only if new maxGap is larger than parent's
+			// old maxGap, propagate this update to parent.
+			p.maxGap.Set(max)
+		}
+		return
+	}
+	// Shrink ancestor maxGaps.
+	for p := n.parent; p != nil; p = p.parent {
+		if p.maxGap.Get() > oldMax {
+			// p and its ancestors still contain a larger gap.
+			break
+		}
+		// If new max is smaller than the old maxGap, and this gap used
+		// to be the maxGap of its parent, iterate parent's children
+		// and calculate parent's new maxGap.(It's probable that parent
+		// has two children with the old maxGap, but we need to check it anyway.)
+		parentNewMax := p.calculateMaxGapInternal()
+		if p.maxGap.Get() == parentNewMax {
+			// p and its ancestors still contain a gap of at least equal size.
+			break
+		}
+		// If p's new maxGap differs from the old one, propagate this update.
+		p.maxGap.Set(parentNewMax)
+	}
+}
+
+// updateMaxGapLocal updates maxGap of the calling node solely with no
+// propagation to ancestor nodes.
+//
+// Precondition: trackGaps must be 1.
+func (n *node) updateMaxGapLocal() {
+	if !n.hasChildren {
+		// Leaf node iterates its gaps.
+		n.maxGap.Set(n.calculateMaxGapLeaf())
+	} else {
+		// Non-leaf node iterates its children.
+		n.maxGap.Set(n.calculateMaxGapInternal())
+	}
+}
+
+// calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the
+// max.
+//
+// Preconditions: n must be a leaf node.
+func (n *node) calculateMaxGapLeaf() Key {
+	max := GapIterator{n, 0}.Range().Length()
+	for i := 1; i <= n.nrSegments; i++ {
+		if current := (GapIterator{n, i}).Range().Length(); current > max {
+			max = current
+		}
+	}
+	return max
+}
+
+// calculateMaxGapInternal iterates children's maxGap within an internal node n
+// and calculate the max.
+//
+// Preconditions: n must be a non-leaf node.
+func (n *node) calculateMaxGapInternal() Key {
+	max := n.children[0].maxGap.Get()
+	for i := 1; i <= n.nrSegments; i++ {
+		if current := n.children[i].maxGap.Get(); current > max {
+			max = current
+		}
+	}
+	return max
+}
+
+// searchFirstLargeEnoughGap returns the first gap having at least minSize length
+// in the subtree rooted by n. If not found, return a terminal gap iterator.
+func (n *node) searchFirstLargeEnoughGap(minSize Key) GapIterator {
+	if n.maxGap.Get() < minSize {
+		return GapIterator{}
+	}
+	if n.hasChildren {
+		for i := 0; i <= n.nrSegments; i++ {
+			if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() {
+				return largeEnoughGap
+			}
+		}
+	} else {
+		for i := 0; i <= n.nrSegments; i++ {
+			currentGap := GapIterator{n, i}
+			if currentGap.Range().Length() >= minSize {
+				return currentGap
+			}
+		}
+	}
+	panic(fmt.Sprintf("invalid maxGap in %v", n))
+}
+
+// searchLastLargeEnoughGap returns the last gap having at least minSize length
+// in the subtree rooted by n. If not found, return a terminal gap iterator.
+func (n *node) searchLastLargeEnoughGap(minSize Key) GapIterator {
+	if n.maxGap.Get() < minSize {
+		return GapIterator{}
+	}
+	if n.hasChildren {
+		for i := n.nrSegments; i >= 0; i-- {
+			if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() {
+				return largeEnoughGap
+			}
+		}
+	} else {
+		for i := n.nrSegments; i >= 0; i-- {
+			currentGap := GapIterator{n, i}
+			if currentGap.Range().Length() >= minSize {
+				return currentGap
+			}
+		}
+	}
+	panic(fmt.Sprintf("invalid maxGap in %v", n))
+}
+
 // A Iterator is conceptually one of:
 //
 // - A pointer to a segment in a set; or
@@ -1243,6 +1469,122 @@ func (gap GapIterator) NextGap() GapIterator {
 	return seg.NextGap()
 }
 
+// NextLargeEnoughGap returns the iterated gap's first next gap with larger
+// length than minSize.  If not found, return a terminal gap iterator (does NOT
+// include this gap itself).
+//
+// Precondition: trackGaps must be 1.
+func (gap GapIterator) NextLargeEnoughGap(minSize Key) GapIterator {
+	if trackGaps != 1 {
+		panic("set is not tracking gaps")
+	}
+	if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments {
+		// If gap is the trailing gap of an non-leaf node,
+		// translate it to the equivalent gap on leaf level.
+		gap.node = gap.NextSegment().node
+		gap.index = 0
+		return gap.nextLargeEnoughGapHelper(minSize)
+	}
+	return gap.nextLargeEnoughGapHelper(minSize)
+}
+
+// nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap
+// to do the real recursions.
+//
+// Preconditions: gap is NOT the trailing gap of a non-leaf node.
+func (gap GapIterator) nextLargeEnoughGapHelper(minSize Key) GapIterator {
+	// Crawl up the tree if no large enough gap in current node or the
+	// current gap is the trailing one on leaf level.
+	for gap.node != nil &&
+		(gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) {
+		gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	}
+	// If no large enough gap throughout the whole set, return a terminal
+	// gap iterator.
+	if gap.node == nil {
+		return GapIterator{}
+	}
+	// Iterate subsequent gaps.
+	gap.index++
+	for gap.index <= gap.node.nrSegments {
+		if gap.node.hasChildren {
+			if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() {
+				return largeEnoughGap
+			}
+		} else {
+			if gap.Range().Length() >= minSize {
+				return gap
+			}
+		}
+		gap.index++
+	}
+	gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	if gap.node != nil && gap.index == gap.node.nrSegments {
+		// If gap is the trailing gap of a non-leaf node, crawl up to
+		// parent again and do recursion.
+		gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	}
+	return gap.nextLargeEnoughGapHelper(minSize)
+}
+
+// PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or
+// equal length than minSize.  If not found, return a terminal gap iterator
+// (does NOT include this gap itself).
+//
+// Precondition: trackGaps must be 1.
+func (gap GapIterator) PrevLargeEnoughGap(minSize Key) GapIterator {
+	if trackGaps != 1 {
+		panic("set is not tracking gaps")
+	}
+	if gap.node != nil && gap.node.hasChildren && gap.index == 0 {
+		// If gap is the first gap of an non-leaf node,
+		// translate it to the equivalent gap on leaf level.
+		gap.node = gap.PrevSegment().node
+		gap.index = gap.node.nrSegments
+		return gap.prevLargeEnoughGapHelper(minSize)
+	}
+	return gap.prevLargeEnoughGapHelper(minSize)
+}
+
+// prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap
+// to do the real recursions.
+//
+// Preconditions: gap is NOT the first gap of a non-leaf node.
+func (gap GapIterator) prevLargeEnoughGapHelper(minSize Key) GapIterator {
+	// Crawl up the tree if no large enough gap in current node or the
+	// current gap is the first one on leaf level.
+	for gap.node != nil &&
+		(gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) {
+		gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	}
+	// If no large enough gap throughout the whole set, return a terminal
+	// gap iterator.
+	if gap.node == nil {
+		return GapIterator{}
+	}
+	// Iterate previous gaps.
+	gap.index--
+	for gap.index >= 0 {
+		if gap.node.hasChildren {
+			if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() {
+				return largeEnoughGap
+			}
+		} else {
+			if gap.Range().Length() >= minSize {
+				return gap
+			}
+		}
+		gap.index--
+	}
+	gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	if gap.node != nil && gap.index == 0 {
+		// If gap is the first gap of a non-leaf node, crawl up to
+		// parent again and do recursion.
+		gap.node, gap.index = gap.node.parent, gap.node.parentIndex
+	}
+	return gap.prevLargeEnoughGapHelper(minSize)
+}
+
 // segmentBeforePosition returns the predecessor segment of the position given
 // by n.children[i], which may or may not contain a child. If no such segment
 // exists, segmentBeforePosition returns a terminal iterator.
@@ -1271,7 +1613,7 @@ func segmentAfterPosition(n *node, i int) Iterator {
 
 func zeroValueSlice(slice []Value) {
 	// TODO(jamieliu): check if Go is actually smart enough to optimize a
-	// ClearValue that assigns nil to a memset here
+	// ClearValue that assigns nil to a memset here.
 	for i := range slice {
 		Functions{}.ClearValue(&slice[i])
 	}
@@ -1310,7 +1652,15 @@ func (n *node) writeDebugString(buf *bytes.Buffer, prefix string) {
 			child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i))
 		}
 		buf.WriteString(prefix)
-		buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+		if n.hasChildren {
+			if trackGaps != 0 {
+				buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get()))
+			} else {
+				buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+			}
+		} else {
+			buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+		}
 	}
 	if child := n.children[n.nrSegments]; child != nil {
 		child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments))
@@ -1362,3 +1712,43 @@ func (s *Set) ImportSortedSlices(sds *SegmentDataSlices) error {
 	}
 	return nil
 }
+
+// segmentTestCheck returns an error if s is incorrectly sorted, does not
+// contain exactly expectedSegments segments, or contains a segment which
+// fails the passed check.
+//
+// This should be used only for testing, and has been added to this package for
+// templating convenience.
+func (s *Set) segmentTestCheck(expectedSegments int, segFunc func(int, Range, Value) error) error {
+	havePrev := false
+	prev := Key(0)
+	nrSegments := 0
+	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		next := seg.Start()
+		if havePrev && prev >= next {
+			return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments)
+		}
+		if segFunc != nil {
+			if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil {
+				return err
+			}
+		}
+		prev = next
+		havePrev = true
+		nrSegments++
+	}
+	if nrSegments != expectedSegments {
+		return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments)
+	}
+	return nil
+}
+
+// countSegments counts the number of segments in the set.
+//
+// Similar to Check, this should only be used for testing.
+func (s *Set) countSegments() (segments int) {
+	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		segments++
+	}
+	return segments
+}
diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD
index f2d8462d8..131bf09b9 100644
--- a/pkg/segment/test/BUILD
+++ b/pkg/segment/test/BUILD
@@ -29,10 +29,28 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "gap_set",
+    out = "gap_set.go",
+    consts = {
+        "trackGaps": "1",
+    },
+    package = "segment",
+    prefix = "gap",
+    template = "//pkg/segment:generic_set",
+    types = {
+        "Key": "int",
+        "Range": "Range",
+        "Value": "int",
+        "Functions": "gapSetFunctions",
+    },
+)
+
 go_library(
     name = "segment",
     testonly = 1,
     srcs = [
+        "gap_set.go",
         "int_range.go",
         "int_set.go",
         "set_functions.go",
diff --git a/pkg/segment/test/segment_test.go b/pkg/segment/test/segment_test.go
index 97b16c158..85fa19096 100644
--- a/pkg/segment/test/segment_test.go
+++ b/pkg/segment/test/segment_test.go
@@ -17,6 +17,7 @@ package segment
 import (
 	"fmt"
 	"math/rand"
+	"reflect"
 	"testing"
 )
 
@@ -32,61 +33,65 @@ const (
 	// valueOffset is the difference between the value and start of test
 	// segments.
 	valueOffset = 100000
+
+	// intervalLength is the interval used by random gap tests.
+	intervalLength = 10
 )
 
 func shuffle(xs []int) {
-	for i := range xs {
-		j := rand.Intn(i + 1)
-		xs[i], xs[j] = xs[j], xs[i]
-	}
+	rand.Shuffle(len(xs), func(i, j int) { xs[i], xs[j] = xs[j], xs[i] })
 }
 
-func randPermutation(size int) []int {
+func randIntervalPermutation(size int) []int {
 	p := make([]int, size)
 	for i := range p {
-		p[i] = i
+		p[i] = intervalLength * i
 	}
 	shuffle(p)
 	return p
 }
 
-// checkSet returns an error if s is incorrectly sorted, does not contain
-// exactly expectedSegments segments, or contains a segment for which val !=
-// key + valueOffset.
-func checkSet(s *Set, expectedSegments int) error {
-	havePrev := false
-	prev := 0
-	nrSegments := 0
-	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
-		next := seg.Start()
-		if havePrev && prev >= next {
-			return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments)
-		}
-		if got, want := seg.Value(), seg.Start()+valueOffset; got != want {
-			return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nrSegments, seg.Start(), got, want)
-		}
-		prev = next
-		havePrev = true
-		nrSegments++
-	}
-	if nrSegments != expectedSegments {
-		return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments)
+// validate can be passed to Check.
+func validate(nr int, r Range, v int) error {
+	if got, want := v, r.Start+valueOffset; got != want {
+		return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nr, r.Start, got, want)
 	}
 	return nil
 }
 
-// countSegmentsIn returns the number of segments in s.
-func countSegmentsIn(s *Set) int {
-	var count int
-	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
-		count++
+// checkSetMaxGap returns an error if maxGap inside all nodes of s is not well
+// maintained.
+func checkSetMaxGap(s *gapSet) error {
+	n := s.root
+	return checkNodeMaxGap(&n)
+}
+
+// checkNodeMaxGap returns an error if maxGap inside the subtree rooted by n is
+// not well maintained.
+func checkNodeMaxGap(n *gapnode) error {
+	var max int
+	if !n.hasChildren {
+		max = n.calculateMaxGapLeaf()
+	} else {
+		for i := 0; i <= n.nrSegments; i++ {
+			child := n.children[i]
+			if err := checkNodeMaxGap(child); err != nil {
+				return err
+			}
+			if temp := child.maxGap.Get(); i == 0 || temp > max {
+				max = temp
+			}
+		}
+	}
+	if max != n.maxGap.Get() {
+		return fmt.Errorf("maxGap wrong in node\n%vexpected: %d got: %d", n, max, n.maxGap)
 	}
-	return count
+	return nil
 }
 
 func TestAddRandom(t *testing.T) {
 	var s Set
-	order := randPermutation(testSize)
+	order := rand.Perm(testSize)
 	var nrInsertions int
 	for i, j := range order {
 		if !s.AddWithoutMerging(Range{j, j + 1}, j+valueOffset) {
@@ -94,12 +99,12 @@ func TestAddRandom(t *testing.T) {
 			break
 		}
 		nrInsertions++
-		if err := checkSet(&s, nrInsertions); err != nil {
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
 			t.Errorf("Iteration %d: %v", i, err)
 			break
 		}
 	}
-	if got, want := countSegmentsIn(&s), nrInsertions; got != want {
+	if got, want := s.countSegments(), nrInsertions; got != want {
 		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
 	}
 	if t.Failed() {
@@ -115,7 +120,156 @@ func TestRemoveRandom(t *testing.T) {
 			t.Fatalf("Failed to insert segment %d", i)
 		}
 	}
-	order := randPermutation(testSize)
+	order := rand.Perm(testSize)
+	var nrRemovals int
+	for i, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			t.Errorf("Iteration %d: failed to find segment with key %d", i, j)
+			break
+		}
+		s.Remove(seg)
+		nrRemovals++
+		if err := s.segmentTestCheck(testSize-nrRemovals, validate); err != nil {
+			t.Errorf("Iteration %d: %v", i, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), testSize-nrRemovals; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Removal order: %v", order[:nrRemovals])
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
+func TestMaxGapAddRandom(t *testing.T) {
+	var s gapSet
+	order := rand.Perm(testSize)
+	var nrInsertions int
+	for i, j := range order {
+		if !s.AddWithoutMerging(Range{j, j + 1}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		nrInsertions++
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
+			t.Errorf("Iteration %d: %v", i, err)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), nrInsertions; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Insertion order: %v", order[:nrInsertions])
+		t.Logf("Set contents:\n%v", &s)
+	}
+}
+
+func TestMaxGapAddRandomWithRandomInterval(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize)
+	var nrInsertions int
+	for i, j := range order {
+		if !s.AddWithoutMerging(Range{j, j + rand.Intn(intervalLength-1) + 1}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		nrInsertions++
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
+			t.Errorf("Iteration %d: %v", i, err)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), nrInsertions; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Insertion order: %v", order[:nrInsertions])
+		t.Logf("Set contents:\n%v", &s)
+	}
+}
+
+func TestMaxGapAddRandomWithMerge(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize)
+	nrInsertions := 1
+	for i, j := range order {
+		if !s.Add(Range{j, j + intervalLength}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), nrInsertions; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Insertion order: %v", order)
+		t.Logf("Set contents:\n%v", &s)
+	}
+}
+
+func TestMaxGapRemoveRandom(t *testing.T) {
+	var s gapSet
+	for i := 0; i < testSize; i++ {
+		if !s.AddWithoutMerging(Range{i, i + 1}, i+valueOffset) {
+			t.Fatalf("Failed to insert segment %d", i)
+		}
+	}
+	order := rand.Perm(testSize)
+	var nrRemovals int
+	for i, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			t.Errorf("Iteration %d: failed to find segment with key %d", i, j)
+			break
+		}
+		temprange := seg.Range()
+		s.Remove(seg)
+		nrRemovals++
+		if err := s.segmentTestCheck(testSize-nrRemovals, validate); err != nil {
+			t.Errorf("Iteration %d: %v", i, err)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
+	}
+	if got, want := s.countSegments(), testSize-nrRemovals; got != want {
+		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
+	}
+	if t.Failed() {
+		t.Logf("Removal order: %v", order[:nrRemovals])
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
+func TestMaxGapRemoveHalfRandom(t *testing.T) {
+	var s gapSet
+	for i := 0; i < testSize; i++ {
+		if !s.AddWithoutMerging(Range{intervalLength * i, intervalLength*i + rand.Intn(intervalLength-1) + 1}, intervalLength*i+valueOffset) {
+			t.Fatalf("Failed to insert segment %d", i)
+		}
+	}
+	order := randIntervalPermutation(testSize)
+	order = order[:testSize/2]
 	var nrRemovals int
 	for i, j := range order {
 		seg := s.FindSegment(j)
@@ -123,14 +277,19 @@ func TestRemoveRandom(t *testing.T) {
 			t.Errorf("Iteration %d: failed to find segment with key %d", i, j)
 			break
 		}
+		temprange := seg.Range()
 		s.Remove(seg)
 		nrRemovals++
-		if err := checkSet(&s, testSize-nrRemovals); err != nil {
+		if err := s.segmentTestCheck(testSize-nrRemovals, validate); err != nil {
 			t.Errorf("Iteration %d: %v", i, err)
 			break
 		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
 	}
-	if got, want := countSegmentsIn(&s), testSize-nrRemovals; got != want {
+	if got, want := s.countSegments(), testSize-nrRemovals; got != want {
 		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
 	}
 	if t.Failed() {
@@ -140,6 +299,148 @@ func TestRemoveRandom(t *testing.T) {
 	}
 }
 
+func TestMaxGapAddRandomRemoveRandomHalfWithMerge(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize * 2)
+	order = order[:testSize]
+	for i, j := range order {
+		if !s.Add(Range{j, j + intervalLength}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	shuffle(order)
+	var nrRemovals int
+	for _, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			continue
+		}
+		temprange := seg.Range()
+		s.Remove(seg)
+		nrRemovals++
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
+	}
+	if t.Failed() {
+		t.Logf("Removal order: %v", order[:nrRemovals])
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
+func TestNextLargeEnoughGap(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize * 2)
+	order = order[:testSize]
+	for i, j := range order {
+		if !s.Add(Range{j, j + rand.Intn(intervalLength-1) + 1}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	shuffle(order)
+	order = order[:testSize/2]
+	for _, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			continue
+		}
+		temprange := seg.Range()
+		s.Remove(seg)
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
+	}
+	minSize := 7
+	var gapArr1 []int
+	for gap := s.LowerBoundGap(0).NextLargeEnoughGap(minSize); gap.Ok(); gap = gap.NextLargeEnoughGap(minSize) {
+		if gap.Range().Length() < minSize {
+			t.Errorf("NextLargeEnoughGap wrong, gap %v has length %d, wanted %d", gap.Range(), gap.Range().Length(), minSize)
+		} else {
+			gapArr1 = append(gapArr1, gap.Range().Start)
+		}
+	}
+	var gapArr2 []int
+	for gap := s.LowerBoundGap(0).NextGap(); gap.Ok(); gap = gap.NextGap() {
+		if gap.Range().Length() >= minSize {
+			gapArr2 = append(gapArr2, gap.Range().Start)
+		}
+	}
+
+	if !reflect.DeepEqual(gapArr2, gapArr1) {
+		t.Errorf("Search result not correct, got: %v, wanted: %v", gapArr1, gapArr2)
+	}
+	if t.Failed() {
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
+func TestPrevLargeEnoughGap(t *testing.T) {
+	var s gapSet
+	order := randIntervalPermutation(testSize * 2)
+	order = order[:testSize]
+	for i, j := range order {
+		if !s.Add(Range{j, j + rand.Intn(intervalLength-1) + 1}, j+valueOffset) {
+			t.Errorf("Iteration %d: failed to insert segment with key %d", i, j)
+			break
+		}
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When inserting %d: %v", j, err)
+			break
+		}
+	}
+	end := s.LastSegment().End()
+	shuffle(order)
+	order = order[:testSize/2]
+	for _, j := range order {
+		seg := s.FindSegment(j)
+		if !seg.Ok() {
+			continue
+		}
+		temprange := seg.Range()
+		s.Remove(seg)
+		if err := checkSetMaxGap(&s); err != nil {
+			t.Errorf("When removing %v: %v", temprange, err)
+			break
+		}
+	}
+	minSize := 7
+	var gapArr1 []int
+	for gap := s.UpperBoundGap(end + intervalLength).PrevLargeEnoughGap(minSize); gap.Ok(); gap = gap.PrevLargeEnoughGap(minSize) {
+		if gap.Range().Length() < minSize {
+			t.Errorf("PrevLargeEnoughGap wrong, gap length %d, wanted %d", gap.Range().Length(), minSize)
+		} else {
+			gapArr1 = append(gapArr1, gap.Range().Start)
+		}
+	}
+	var gapArr2 []int
+	for gap := s.UpperBoundGap(end + intervalLength).PrevGap(); gap.Ok(); gap = gap.PrevGap() {
+		if gap.Range().Length() >= minSize {
+			gapArr2 = append(gapArr2, gap.Range().Start)
+		}
+	}
+	if !reflect.DeepEqual(gapArr2, gapArr1) {
+		t.Errorf("Search result not correct, got: %v, wanted: %v", gapArr1, gapArr2)
+	}
+	if t.Failed() {
+		t.Logf("Set contents:\n%v", &s)
+		t.FailNow()
+	}
+}
+
 func TestAddSequentialAdjacent(t *testing.T) {
 	var s Set
 	var nrInsertions int
@@ -148,12 +449,12 @@ func TestAddSequentialAdjacent(t *testing.T) {
 			t.Fatalf("Failed to insert segment %d", i)
 		}
 		nrInsertions++
-		if err := checkSet(&s, nrInsertions); err != nil {
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
 			t.Errorf("Iteration %d: %v", i, err)
 			break
 		}
 	}
-	if got, want := countSegmentsIn(&s), nrInsertions; got != want {
+	if got, want := s.countSegments(), nrInsertions; got != want {
 		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
 	}
 	if t.Failed() {
@@ -202,12 +503,12 @@ func TestAddSequentialNonAdjacent(t *testing.T) {
 			t.Fatalf("Failed to insert segment %d", i)
 		}
 		nrInsertions++
-		if err := checkSet(&s, nrInsertions); err != nil {
+		if err := s.segmentTestCheck(nrInsertions, validate); err != nil {
 			t.Errorf("Iteration %d: %v", i, err)
 			break
 		}
 	}
-	if got, want := countSegmentsIn(&s), nrInsertions; got != want {
+	if got, want := s.countSegments(), nrInsertions; got != want {
 		t.Errorf("Wrong final number of segments: got %d, wanted %d", got, want)
 	}
 	if t.Failed() {
@@ -293,7 +594,7 @@ Tests:
 		var i int
 		for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
 			if i > len(test.final) {
-				t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, countSegmentsIn(&s), len(test.final), &s)
+				t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, s.countSegments(), len(test.final), &s)
 				continue Tests
 			}
 			if got, want := seg.Range(), test.final[i]; got != want {
@@ -351,7 +652,7 @@ Tests:
 		var i int
 		for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
 			if i > len(test.final) {
-				t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, countSegmentsIn(&s), len(test.final), &s)
+				t.Errorf("%s: Incorrect number of segments: got %d, wanted %d; set contents:\n%v", test.name, s.countSegments(), len(test.final), &s)
 				continue Tests
 			}
 			if got, want := seg.Range(), test.final[i]; got != want {
@@ -378,7 +679,7 @@ func benchmarkAddSequential(b *testing.B, size int) {
 }
 
 func benchmarkAddRandom(b *testing.B, size int) {
-	order := randPermutation(size)
+	order := rand.Perm(size)
 
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
@@ -416,7 +717,7 @@ func benchmarkFindRandom(b *testing.B, size int) {
 			b.Fatalf("Failed to insert segment %d", i)
 		}
 	}
-	order := randPermutation(size)
+	order := rand.Perm(size)
 
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
@@ -470,7 +771,7 @@ func benchmarkAddFindRemoveSequential(b *testing.B, size int) {
 }
 
 func benchmarkAddFindRemoveRandom(b *testing.B, size int) {
-	order := randPermutation(size)
+	order := rand.Perm(size)
 
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
diff --git a/pkg/segment/test/set_functions.go b/pkg/segment/test/set_functions.go
index bcddb39bb..7cd895cc7 100644
--- a/pkg/segment/test/set_functions.go
+++ b/pkg/segment/test/set_functions.go
@@ -14,21 +14,16 @@
 
 package segment
 
-// Basic numeric constants that we define because the math package doesn't.
-// TODO(nlacasse): These should be Math.MaxInt64/MinInt64?
-const (
-	maxInt = int(^uint(0) >> 1)
-	minInt = -maxInt - 1
-)
-
 type setFunctions struct{}
 
-func (setFunctions) MinKey() int {
-	return minInt
+// MinKey returns the minimum key for the set.
+func (s setFunctions) MinKey() int {
+	return -s.MaxKey() - 1
 }
 
+// MaxKey returns the maximum key for the set.
 func (setFunctions) MaxKey() int {
-	return maxInt
+	return int(^uint(0) >> 1)
 }
 
 func (setFunctions) ClearValue(*int) {}
@@ -40,3 +35,20 @@ func (setFunctions) Merge(_ Range, val1 int, _ Range, _ int) (int, bool) {
 func (setFunctions) Split(_ Range, val int, _ int) (int, int) {
 	return val, val
 }
+
+type gapSetFunctions struct {
+	setFunctions
+}
+
+// MinKey is adjusted to make sure no add overflow would happen in test cases.
+// e.g. A gap with range {MinInt32, 2} would cause overflow in Range().Length().
+//
+// Normally Keys should be unsigned to avoid these issues.
+func (s gapSetFunctions) MinKey() int {
+	return s.setFunctions.MinKey() / 2
+}
+
+// MaxKey returns the maximum key for the set.
+func (s gapSetFunctions) MaxKey() int {
+	return s.setFunctions.MaxKey() / 2
+}
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index 343f81f59..daba8b172 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -17,7 +17,6 @@
 package arch
 
 import (
-	"encoding/binary"
 	"fmt"
 	"io"
 
@@ -49,9 +48,14 @@ const ARMTrapFlag = uint64(1) << 21
 type aarch64FPState []byte
 
 // initAarch64FPState sets up initial state.
+//
+// Related code in Linux kernel: fpsimd_flush_thread().
+// FPCR = FPCR_RM_RN (0x0 << 22).
+//
+// Currently, aarch64FPState is only a space of 0x210 length for fpstate.
+// The fp head is useless in sentry/ptrace/kvm.
+//
 func initAarch64FPState(data aarch64FPState) {
-	binary.LittleEndian.PutUint32(data, fpsimdMagic)
-	binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize)
 }
 
 func newAarch64FPStateSlice() []byte {
diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go
index 92d062513..95dfd1e90 100644
--- a/pkg/sentry/arch/syscalls_arm64.go
+++ b/pkg/sentry/arch/syscalls_arm64.go
@@ -23,7 +23,7 @@ const restartSyscallNr = uintptr(128)
 //
 // In linux, at the entry of the syscall handler(el0_svc_common()), value of R0
 // is saved to the pt_regs.orig_x0 in kernel code. But currently, the orig_x0
-// was not accessible to the user space application, so we have to do the same
+// was not accessible to the userspace application, so we have to do the same
 // operation in the sentry code to save the R0 value into the App context.
 func (c *context64) SyscallSaveOrig() {
 	c.OrigR0 = c.Regs.Regs[0]
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index e74275d2d..2c5d14be5 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -16,14 +16,12 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/context",
         "//pkg/fd",
-        "//pkg/fspath",
         "//pkg/log",
         "//pkg/sentry/fdimport",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/host",
-        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fs/user",
         "//pkg/sentry/fsimpl/host",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
@@ -35,7 +33,6 @@ go_library(
         "//pkg/sentry/vfs",
         "//pkg/sentry/watchdog",
         "//pkg/sync",
-        "//pkg/syserror",
         "//pkg/tcpip/link/sniffer",
         "//pkg/urpc",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 2ed17ee09..1bae7cfaf 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -18,7 +18,6 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
-	"path"
 	"sort"
 	"strings"
 	"text/tabwriter"
@@ -26,13 +25,10 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fdimport"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
-	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
 	hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -40,7 +36,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
@@ -108,6 +103,9 @@ type ExecArgs struct {
 
 // String prints the arguments as a string.
 func (args ExecArgs) String() string {
+	if len(args.Argv) == 0 {
+		return args.Filename
+	}
 	a := make([]string, len(args.Argv))
 	copy(a, args.Argv)
 	if args.Filename != "" {
@@ -180,42 +178,30 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 	}
 	ctx := initArgs.NewContext(proc.Kernel)
 
-	if initArgs.Filename == "" {
-		if kernel.VFS2Enabled {
-			// Get the full path to the filename from the PATH env variable.
-			if initArgs.MountNamespaceVFS2 == nil {
-				// Set initArgs so that 'ctx' returns the namespace.
-				//
-				// MountNamespaceVFS2 adds a reference to the namespace, which is
-				// transferred to the new process.
-				initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
-			}
+	if kernel.VFS2Enabled {
+		// Get the full path to the filename from the PATH env variable.
+		if initArgs.MountNamespaceVFS2 == nil {
+			// Set initArgs so that 'ctx' returns the namespace.
+			//
+			// MountNamespaceVFS2 adds a reference to the namespace, which is
+			// transferred to the new process.
+			initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+		}
+	} else {
+		if initArgs.MountNamespace == nil {
+			// Set initArgs so that 'ctx' returns the namespace.
+			initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
 
-			paths := fs.GetPath(initArgs.Envv)
-			vfsObj := proc.Kernel.VFS()
-			file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
-			if err != nil {
-				return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
-			}
-			initArgs.File = fsbridge.NewVFSFile(file)
-		} else {
-			// Get the full path to the filename from the PATH env variable.
-			paths := fs.GetPath(initArgs.Envv)
-			if initArgs.MountNamespace == nil {
-				// Set initArgs so that 'ctx' returns the namespace.
-				initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
-
-				// initArgs must hold a reference on MountNamespace, which will
-				// be donated to the new process in CreateProcess.
-				initArgs.MountNamespace.IncRef()
-			}
-			f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
-			if err != nil {
-				return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
-			}
-			initArgs.Filename = f
+			// initArgs must hold a reference on MountNamespace, which will
+			// be donated to the new process in CreateProcess.
+			initArgs.MountNamespace.IncRef()
 		}
 	}
+	resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
+	if err != nil {
+		return nil, 0, nil, nil, err
+	}
+	initArgs.Filename = resolved
 
 	fds := make([]int, len(args.FilePayload.Files))
 	for i, file := range args.FilePayload.Files {
@@ -428,67 +414,3 @@ func ttyName(tty *kernel.TTY) string {
 	}
 	return fmt.Sprintf("pts/%d", tty.Index)
 }
-
-// ResolveExecutablePath resolves the given executable name given a set of
-// paths that might contain it.
-func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) {
-	root := vfs.RootFromContext(ctx)
-	defer root.DecRef()
-	creds := auth.CredentialsFromContext(ctx)
-
-	// Absolute paths can be used directly.
-	if path.IsAbs(name) {
-		return openExecutable(ctx, vfsObj, creds, root, name)
-	}
-
-	// Paths with '/' in them should be joined to the working directory, or
-	// to the root if working directory is not set.
-	if strings.IndexByte(name, '/') > 0 {
-		if len(wd) == 0 {
-			wd = "/"
-		}
-		if !path.IsAbs(wd) {
-			return nil, fmt.Errorf("working directory %q must be absolute", wd)
-		}
-		return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name))
-	}
-
-	// Otherwise, we must lookup the name in the paths, starting from the
-	// calling context's root directory.
-	for _, p := range paths {
-		if !path.IsAbs(p) {
-			// Relative paths aren't safe, no one should be using them.
-			log.Warningf("Skipping relative path %q in $PATH", p)
-			continue
-		}
-
-		binPath := path.Join(p, name)
-		f, err := openExecutable(ctx, vfsObj, creds, root, binPath)
-		if err != nil {
-			return nil, err
-		}
-		if f == nil {
-			continue // Not found/no access.
-		}
-		return f, nil
-	}
-	return nil, syserror.ENOENT
-}
-
-func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) {
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root, // binPath is absolute, Start can be anything.
-		Path:               fspath.Parse(path),
-		FollowFinalSymlink: true,
-	}
-	opts := &vfs.OpenOptions{
-		Flags:    linux.O_RDONLY,
-		FileExec: true,
-	}
-	f, err := vfsObj.OpenAt(ctx, creds, &pop, opts)
-	if err == syserror.ENOENT || err == syserror.EACCES {
-		return nil, nil
-	}
-	return f, err
-}
diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go
index c7e197691..af66fe4dc 100644
--- a/pkg/sentry/devices/memdev/full.go
+++ b/pkg/sentry/devices/memdev/full.go
@@ -42,6 +42,7 @@ type fullFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go
index 33d060d02..92d3d71be 100644
--- a/pkg/sentry/devices/memdev/null.go
+++ b/pkg/sentry/devices/memdev/null.go
@@ -43,6 +43,7 @@ type nullFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go
index acfa23149..6b81da5ef 100644
--- a/pkg/sentry/devices/memdev/random.go
+++ b/pkg/sentry/devices/memdev/random.go
@@ -48,6 +48,7 @@ type randomFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	// off is the "file offset". off is accessed using atomic memory
 	// operations.
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
index 3b1372b9e..c6f15054d 100644
--- a/pkg/sentry/devices/memdev/zero.go
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -44,6 +44,7 @@ type zeroFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 846252c89..ca41520b4 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -146,7 +146,7 @@ func (f *File) DecRef() {
 	f.DecRefWithDestructor(func() {
 		// Drop BSD style locks.
 		lockRng := lock.LockRange{Start: 0, End: lock.LockEOF}
-		f.Dirent.Inode.LockCtx.BSD.UnlockRegion(lock.UniqueID(f.UniqueID), lockRng)
+		f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng)
 
 		// Release resources held by the FileOperations.
 		f.FileOperations.Release()
@@ -310,7 +310,6 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error
 	if !f.mu.Lock(ctx) {
 		return 0, syserror.ErrInterrupted
 	}
-
 	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
 	// Handle append mode.
 	if f.Flags().Append {
@@ -355,7 +354,6 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
 	// offset."
 	unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append)
 	defer unlockAppendMu()
-
 	if f.Flags().Append {
 		if err := f.offsetForAppend(ctx, &offset); err != nil {
 			return 0, err
@@ -374,9 +372,10 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64
 	return f.FileOperations.Write(ctx, f, src, offset)
 }
 
-// offsetForAppend sets the given offset to the end of the file.
+// offsetForAppend atomically sets the given offset to the end of the file.
 //
-// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing.
+// Precondition: the file.Dirent.Inode.appendMu mutex should be held for
+// writing.
 func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
 	uattr, err := f.Dirent.Inode.UnstableAttr(ctx)
 	if err != nil {
@@ -386,7 +385,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error {
 	}
 
 	// Update the offset.
-	*offset = uattr.Size
+	atomic.StoreInt64(offset, uattr.Size)
 
 	return nil
 }
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go
index bdba6efe5..d2dbff268 100644
--- a/pkg/sentry/fs/fs.go
+++ b/pkg/sentry/fs/fs.go
@@ -42,9 +42,10 @@
 //     Dirent.dirMu
 //       Dirent.mu
 //         DirentCache.mu
-//         Locks in InodeOperations implementations or overlayEntry
 //         Inode.Watches.mu (see `Inotify` for other lock ordering)
 //         MountSource.mu
+//         Inode.appendMu
+//           Locks in InodeOperations implementations or overlayEntry
 //
 // If multiple Dirent or MountSource locks must be taken, locks in the parent must be
 // taken before locks in their children.
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
index 6564fd0c6..dd6f5aba6 100644
--- a/pkg/sentry/fs/fsutil/frame_ref_set.go
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -18,6 +18,7 @@ import (
 	"math"
 
 	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
 )
 
 // FrameRefSetFunctions implements segment.Functions for FrameRefSet.
@@ -49,3 +50,42 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
 func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
 	return val, val
 }
+
+// IncRefAndAccount adds a reference on the range fr. All newly inserted segments
+// are accounted as host page cache memory mappings.
+func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
+	seg, gap := refs.Find(fr.Start)
+	for {
+		switch {
+		case seg.Ok() && seg.Start() < fr.End:
+			seg = refs.Isolate(seg, fr)
+			seg.SetValue(seg.Value() + 1)
+			seg, gap = seg.NextNonEmpty()
+		case gap.Ok() && gap.Start() < fr.End:
+			newRange := gap.Range().Intersect(fr)
+			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
+			seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
+		default:
+			refs.MergeAdjacent(fr)
+			return
+		}
+	}
+}
+
+// DecRefAndAccount removes a reference on the range fr and untracks segments
+// that are removed from memory accounting.
+func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) {
+	seg := refs.FindSegment(fr.Start)
+
+	for seg.Ok() && seg.Start() < fr.End {
+		seg = refs.Isolate(seg, fr)
+		if old := seg.Value(); old == 1 {
+			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
+			seg = refs.Remove(seg).NextSegment()
+		} else {
+			seg.SetValue(old - 1)
+			seg = seg.NextSegment()
+		}
+	}
+	refs.MergeAdjacent(fr)
+}
diff --git a/pkg/sentry/fs/g3doc/.gitignore b/pkg/sentry/fs/g3doc/.gitignore
new file mode 100644
index 000000000..2d19fc766
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
new file mode 100644
index 000000000..2ca84dd74
--- /dev/null
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -0,0 +1,263 @@
+# Foreword
+
+This document describes an on-going project to support FUSE filesystems within
+the sentry. This is intended to become the final documentation for this
+subsystem, and is therefore written in the past tense. However FUSE support is
+currently incomplete and the document will be updated as things progress.
+
+# FUSE: Filesystem in Userspace
+
+The sentry supports dispatching filesystem operations to a FUSE server, allowing
+FUSE filesystem to be used with a sandbox.
+
+## Overview
+
+FUSE has two main components:
+
+1.  A client kernel driver (canonically `fuse.ko` in Linux), which forwards
+    filesystem operations (usually initiated by syscalls) to the server.
+
+2.  A server, which is a userspace daemon that implements the actual filesystem.
+
+The sentry implements the client component, which allows a server daemon running
+within the sandbox to implement a filesystem within the sandbox.
+
+A FUSE filesystem is initialized with `mount(2)`, typically with the help of a
+utility like `fusermount(1)`. Various mount options exist for establishing
+ownership and access permissions on the filesystem, but the most important mount
+option is a file descriptor used to establish communication between the client
+and server.
+
+The FUSE device FD is obtained by opening `/dev/fuse`. During regular operation,
+the client and server use the FUSE protocol described in `fuse(4)` to service
+filesystem operations. See the "Protocol" section below for more information
+about this protocol. The core of the sentry support for FUSE is the client-side
+implementation of this protocol.
+
+## FUSE in the Sentry
+
+The sentry's FUSE client targets VFS2 and has the following components:
+
+-   An implementation of `/dev/fuse`.
+
+-   A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting
+    VFS2, one point of contention may be the lack of inodes in VFS2. We can
+    tentatively implement a kernfs-based filesystem to bridge the gap in APIs.
+    The kernfs base functionality can serve the role of the Linux inode cache
+    and, the filesystem can map VFS2 syscalls to kernfs inode operations; see
+    the `kernfs.Inode` interface.
+
+The FUSE protocol lends itself well to marshaling with `go_marshal`. The various
+request and response packets can be defined in the ABI package and converted to
+and from the wire format using `go_marshal`.
+
+### Design Goals
+
+-   While filesystem performance is always important, the sentry's FUSE support
+    is primarily concerned with compatibility, with performance as a secondary
+    concern.
+
+-   Avoiding deadlocks from a hung server daemon.
+
+-   Consider the potential for denial of service from a malicious server daemon.
+    Protecting itself from userspace is already a design goal for the sentry,
+    but needs additional consideration for FUSE. Normally, an operating system
+    doesn't rely on userspace to make progress with filesystem operations. Since
+    this changes with FUSE, it opens up the possibility of creating a chain of
+    dependencies controlled by userspace, which could affect an entire sandbox.
+    For example: a FUSE op can block a syscall, which could be holding a
+    subsystem lock, which can then block another task goroutine.
+
+### Milestones
+
+Below are some broad goals to aim for while implementing FUSE in the sentry.
+Many FUSE ops can be grouped into broad categories of functionality, and most
+ops can be implemented in parallel.
+
+#### Minimal client that can mount a trivial FUSE filesystem.
+
+-   Implement `/dev/fuse` - a character device used to establish an FD for
+    communication between the sentry and the server daemon.
+
+-   Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
+
+#### Read-only mount with basic file operations
+
+-   Implement the majority of file, directory and file descriptor FUSE ops. For
+    this milestone, we can skip uncommon or complex operations like mmap, mknod,
+    file locking, poll, and extended attributes. We can stub these out along
+    with any ops that modify the filesystem. The exact list of required ops are
+    to be determined, but the goal is to mount a real filesystem as read-only,
+    and be able to read contents from the filesystem in the sentry.
+
+#### Full read-write support
+
+-   Implement the remaining FUSE ops and decide if we can omit rarely used
+    operations like ioctl.
+
+# Appendix
+
+## FUSE Protocol
+
+The FUSE protocol is a request-response protocol. All requests are initiated by
+the client. The wire-format for the protocol is raw C structs serialized to
+memory.
+
+All FUSE requests begin with the following request header:
+
+```c
+struct fuse_in_header {
+  uint32_t len;       // Length of the request, including this header.
+  uint32_t opcode;    // Requested operation.
+  uint64_t unique;    // A unique identifier for this request.
+  uint64_t nodeid;    // ID of the filesystem object being operated on.
+  uint32_t uid;       // UID of the requesting process.
+  uint32_t gid;       // GID of the requesting process.
+  uint32_t pid;       // PID of the requesting process.
+  uint32_t padding;
+};
+```
+
+The request is then followed by a payload specific to the `opcode`.
+
+All responses begin with this response header:
+
+```c
+struct fuse_out_header {
+  uint32_t len;       // Length of the response, including this header.
+  int32_t  error;     // Status of the request, 0 if success.
+  uint64_t unique;    // The unique identifier from the corresponding request.
+};
+```
+
+The response payload also depends on the request `opcode`. If `error != 0`, the
+response payload must be empty.
+
+### Operations
+
+The following is a list of all FUSE operations used in `fuse_in_header.opcode`
+as of Linux v4.4, and a brief description of their purpose. These are defined in
+`uapi/linux/fuse.h`. Many of these have a corresponding request and response
+payload struct; `fuse(4)` has details for some of these. We also note how these
+operations map to the sentry virtual filesystem.
+
+#### FUSE meta-operations
+
+These operations are specific to FUSE and don't have a corresponding action in a
+generic filesystem.
+
+-   `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the
+    first message sent by the client after mount. This is used for version and
+    feature negotiation. This is related to `mount(2)`.
+-   `FUSE_DESTROY`: Teardown a FUSE filesystem, related to `unmount(2)`.
+-   `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the
+    `fuse_in_header.unique` value provided in the corresponding request header.
+    The client can send at most one of these per request, and will enter an
+    uninterruptible wait for a reply. The server is expected to reply promptly.
+-   `FUSE_FORGET`: A hint to the server that server should evict the indicate
+    node from any caches. This is wired up to `(struct
+    super_operations).evict_inode` in Linux, which is in turned hooked as the
+    inode cache shrinker which is typically triggered by system memory pressure.
+-   `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`.
+
+#### Filesystem Syscalls
+
+These FUSE ops map directly to an equivalent filesystem syscall, or family of
+syscalls. The relevant syscalls have a similar name to the operation, unless
+otherwise noted.
+
+Node creation:
+
+-   `FUSE_MKNOD`
+-   `FUSE_MKDIR`
+-   `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which
+    atomically creates and opens a node.
+
+Node attributes and extended attributes:
+
+-   `FUSE_GETATTR`
+-   `FUSE_SETATTR`
+-   `FUSE_SETXATTR`
+-   `FUSE_GETXATTR`
+-   `FUSE_LISTXATTR`
+-   `FUSE_REMOVEXATTR`
+
+Node link manipulation:
+
+-   `FUSE_READLINK`
+-   `FUSE_LINK`
+-   `FUSE_SYMLINK`
+-   `FUSE_UNLINK`
+
+Directory operations:
+
+-   `FUSE_RMDIR`
+-   `FUSE_RENAME`
+-   `FUSE_RENAME2`
+-   `FUSE_OPENDIR`: `open(2)` for directories.
+-   `FUSE_RELEASEDIR`: `close(2)` for directories.
+-   `FUSE_READDIR`
+-   `FUSE_READDIRPLUS`
+-   `FUSE_FSYNCDIR`: `fsync(2)` for directories.
+-   `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is
+    reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path
+    component to a node. However the returned identifier is opaque to the
+    client. The server must remember this mapping, as this is how the client
+    will reference the node in the future.
+
+File operations:
+
+-   `FUSE_OPEN`: `open(2)` for files.
+-   `FUSE_RELEASE`: `close(2)` for files.
+-   `FUSE_FSYNC`
+-   `FUSE_FALLOCATE`
+-   `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`.
+-   `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`.
+
+File locking:
+
+-   `FUSE_GETLK`
+-   `FUSE_SETLK`
+-   `FUSE_SETLKW`
+-   `FUSE_COPY_FILE_RANGE`
+
+File descriptor operations:
+
+-   `FUSE_IOCTL`
+-   `FUSE_POLL`
+-   `FUSE_LSEEK`
+
+Filesystem operations:
+
+-   `FUSE_STATFS`
+
+#### Permissions
+
+-   `FUSE_ACCESS` is used to check if a node is accessible, as part of many
+    syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` in the
+    sentry.
+
+#### I/O Operations
+
+These ops are used to read and write file pages. They're used to implement both
+I/O syscalls like `read(2)`, `write(2)` and `mmap(2)`.
+
+-   `FUSE_READ`
+-   `FUSE_WRITE`
+
+#### Miscellaneous
+
+-   `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is
+    closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)`
+    syscall from the user. Maps to `vfs.FileDescriptorImpl.Release` in the
+    sentry.
+-   `FUSE_BMAP`: Old address space API for block defrag. Probably not needed.
+-   `FUSE_NOTIFY_REPLY`: [TODO: what does this do?]
+
+# References
+
+-   [fuse(4) Linux manual page](https://www.man7.org/linux/man-pages/man4/fuse.4.html)
+-   [Linux kernel FUSE documentation](https://www.kernel.org/doc/html/latest/filesystems/fuse.html)
+-   [The reference implementation of the Linux FUSE (Filesystem in Userspace)
+    interface](https://github.com/libfuse/libfuse)
+-   [The kernel interface of FUSE](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fuse.h)
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index a016c896e..51d7368a1 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -640,7 +640,7 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset,
 
 // WriteOut implements fs.InodeOperations.WriteOut.
 func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
-	if !i.session().cachePolicy.cacheUAttrs(inode) {
+	if inode.MountSource.Flags.ReadOnly || !i.session().cachePolicy.cacheUAttrs(inode) {
 		return nil
 	}
 
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 62f1246aa..fbfba1b58 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -368,6 +368,9 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset,
 
 // WriteOut implements fs.InodeOperations.WriteOut.
 func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
+	if inode.MountSource.Flags.ReadOnly {
+		return nil
+	}
 	// Have we been using host kernel metadata caches?
 	if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) {
 		// Then the metadata is already up to date on the host.
diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go
index 926538d90..8a5d9c7eb 100644
--- a/pkg/sentry/fs/lock/lock.go
+++ b/pkg/sentry/fs/lock/lock.go
@@ -62,7 +62,7 @@ import (
 type LockType int
 
 // UniqueID is a unique identifier of the holder of a regional file lock.
-type UniqueID uint64
+type UniqueID interface{}
 
 const (
 	// ReadLock describes a POSIX regional file lock to be taken
@@ -98,12 +98,7 @@ type Lock struct {
 	// If len(Readers) > 0 then HasWriter must be false.
 	Readers map[UniqueID]bool
 
-	// HasWriter indicates that this is a write lock held by a single
-	// UniqueID.
-	HasWriter bool
-
-	// Writer is only valid if HasWriter is true.  It identifies a
-	// single write lock holder.
+	// Writer holds the writer unique ID. It's nil if there are no writers.
 	Writer UniqueID
 }
 
@@ -186,7 +181,6 @@ func makeLock(uid UniqueID, t LockType) Lock {
 	case ReadLock:
 		value.Readers[uid] = true
 	case WriteLock:
-		value.HasWriter = true
 		value.Writer = uid
 	default:
 		panic(fmt.Sprintf("makeLock: invalid lock type %d", t))
@@ -196,10 +190,7 @@ func makeLock(uid UniqueID, t LockType) Lock {
 
 // isHeld returns true if uid is a holder of Lock.
 func (l Lock) isHeld(uid UniqueID) bool {
-	if l.HasWriter && l.Writer == uid {
-		return true
-	}
-	return l.Readers[uid]
+	return l.Writer == uid || l.Readers[uid]
 }
 
 // lock sets uid as a holder of a typed lock on Lock.
@@ -214,20 +205,20 @@ func (l *Lock) lock(uid UniqueID, t LockType) {
 		}
 		// We cannot downgrade a write lock to a read lock unless the
 		// uid is the same.
-		if l.HasWriter {
+		if l.Writer != nil {
 			if l.Writer != uid {
 				panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer))
 			}
 			// Ensure that there is only one reader if upgrading.
 			l.Readers = make(map[UniqueID]bool)
 			// Ensure that there is no longer a writer.
-			l.HasWriter = false
+			l.Writer = nil
 		}
 		l.Readers[uid] = true
 		return
 	case WriteLock:
 		// If we are already the writer, then this is a no-op.
-		if l.HasWriter && l.Writer == uid {
+		if l.Writer == uid {
 			return
 		}
 		// We can only upgrade a read lock to a write lock if there
@@ -243,7 +234,6 @@ func (l *Lock) lock(uid UniqueID, t LockType) {
 		}
 		// Ensure that there is only a writer.
 		l.Readers = make(map[UniqueID]bool)
-		l.HasWriter = true
 		l.Writer = uid
 	default:
 		panic(fmt.Sprintf("lock: invalid lock type %d", t))
@@ -277,9 +267,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
 	switch t {
 	case ReadLock:
 		return l.lockable(r, func(value Lock) bool {
-			// If there is no writer, there's no problem adding
-			// another reader.
-			if !value.HasWriter {
+			// If there is no writer, there's no problem adding another reader.
+			if value.Writer == nil {
 				return true
 			}
 			// If there is a writer, then it must be the same uid
@@ -289,10 +278,9 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
 	case WriteLock:
 		return l.lockable(r, func(value Lock) bool {
 			// If there are only readers.
-			if !value.HasWriter {
-				// Then this uid can only take a write lock if
-				// this is a private upgrade, meaning that the
-				// only reader is uid.
+			if value.Writer == nil {
+				// Then this uid can only take a write lock if this is a private
+				// upgrade, meaning that the only reader is uid.
 				return len(value.Readers) == 1 && value.Readers[uid]
 			}
 			// If the uid is already a writer on this region, then
@@ -304,7 +292,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool {
 	}
 }
 
-// lock returns true if uid took a lock of type t on the entire range of LockRange.
+// lock returns true if uid took a lock of type t on the entire range of
+// LockRange.
 //
 // Preconditions: r.Start <= r.End (will panic otherwise).
 func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
@@ -339,7 +328,7 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool {
 			seg, _ = l.SplitUnchecked(seg, r.End)
 		}
 
-		// Set the lock on the segment.  This is guaranteed to
+		// Set the lock on the segment. This is guaranteed to
 		// always be safe, given canLock above.
 		value := seg.ValuePtr()
 		value.lock(uid, t)
@@ -386,7 +375,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) {
 
 		value := seg.Value()
 		var remove bool
-		if value.HasWriter && value.Writer == uid {
+		if value.Writer == uid {
 			// If we are unlocking a writer, then since there can
 			// only ever be one writer and no readers, then this
 			// lock should always be removed from the set.
diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go
index 8a3ace0c1..50a16e662 100644
--- a/pkg/sentry/fs/lock/lock_set_functions.go
+++ b/pkg/sentry/fs/lock/lock_set_functions.go
@@ -44,14 +44,9 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock)
 			return Lock{}, false
 		}
 	}
-	if val1.HasWriter != val2.HasWriter {
+	if val1.Writer != val2.Writer {
 		return Lock{}, false
 	}
-	if val1.HasWriter {
-		if val1.Writer != val2.Writer {
-			return Lock{}, false
-		}
-	}
 	return val1, true
 }
 
@@ -62,7 +57,6 @@ func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock)
 	for k, v := range val.Readers {
 		val0.Readers[k] = v
 	}
-	val0.HasWriter = val.HasWriter
 	val0.Writer = val.Writer
 
 	return val, val0
diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go
index ba002aeb7..fad90984b 100644
--- a/pkg/sentry/fs/lock/lock_test.go
+++ b/pkg/sentry/fs/lock/lock_test.go
@@ -42,9 +42,6 @@ func equals(e0, e1 []entry) bool {
 		if !reflect.DeepEqual(e0[i].LockRange, e1[i].LockRange) {
 			return false
 		}
-		if e0[i].Lock.HasWriter != e1[i].Lock.HasWriter {
-			return false
-		}
 		if e0[i].Lock.Writer != e1[i].Lock.Writer {
 			return false
 		}
@@ -105,7 +102,7 @@ func TestCanLock(t *testing.T) {
 			LockRange: LockRange{2048, 3072},
 		},
 		{
-			Lock:      Lock{HasWriter: true, Writer: 1},
+			Lock:      Lock{Writer: 1},
 			LockRange: LockRange{3072, 4096},
 		},
 	})
@@ -241,7 +238,7 @@ func TestSetLock(t *testing.T) {
 			// 0                                  max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -254,7 +251,7 @@ func TestSetLock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -273,7 +270,7 @@ func TestSetLock(t *testing.T) {
 					LockRange: LockRange{0, 4096},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{4096, LockEOF},
 				},
 			},
@@ -301,7 +298,7 @@ func TestSetLock(t *testing.T) {
 			// 0          4096                    max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 4096},
 				},
 				{
@@ -318,7 +315,7 @@ func TestSetLock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -550,7 +547,7 @@ func TestSetLock(t *testing.T) {
 					LockRange: LockRange{0, 1024},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{1024, 4096},
 				},
 				{
@@ -594,7 +591,7 @@ func TestSetLock(t *testing.T) {
 					LockRange: LockRange{0, 1024},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{1024, 3072},
 				},
 				{
@@ -633,7 +630,7 @@ func TestSetLock(t *testing.T) {
 			// 0            1024        2048        4096   max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
@@ -663,11 +660,11 @@ func TestSetLock(t *testing.T) {
 			// 0            1024                     max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{1024, LockEOF},
 				},
 			},
@@ -675,28 +672,30 @@ func TestSetLock(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		l := fill(test.before)
+		t.Run(test.name, func(t *testing.T) {
+			l := fill(test.before)
 
-		r := LockRange{Start: test.start, End: test.end}
-		success := l.lock(test.uid, test.lockType, r)
-		var got []entry
-		for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
-			got = append(got, entry{
-				Lock:      seg.Value(),
-				LockRange: seg.Range(),
-			})
-		}
+			r := LockRange{Start: test.start, End: test.end}
+			success := l.lock(test.uid, test.lockType, r)
+			var got []entry
+			for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+				got = append(got, entry{
+					Lock:      seg.Value(),
+					LockRange: seg.Range(),
+				})
+			}
 
-		if success != test.success {
-			t.Errorf("%s: setlock(%v, %+v, %d, %d) got success %v, want %v", test.name, test.before, r, test.uid, test.lockType, success, test.success)
-			continue
-		}
+			if success != test.success {
+				t.Errorf("setlock(%v, %+v, %d, %d) got success %v, want %v", test.before, r, test.uid, test.lockType, success, test.success)
+				return
+			}
 
-		if success {
-			if !equals(got, test.after) {
-				t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after)
+			if success {
+				if !equals(got, test.after) {
+					t.Errorf("got set %+v, want %+v", got, test.after)
+				}
 			}
-		}
+		})
 	}
 }
 
@@ -782,7 +781,7 @@ func TestUnlock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -824,7 +823,7 @@ func TestUnlock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -837,7 +836,7 @@ func TestUnlock(t *testing.T) {
 			// 0     4096                    max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{4096, LockEOF},
 				},
 			},
@@ -876,7 +875,7 @@ func TestUnlock(t *testing.T) {
 			// 0                                  max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, LockEOF},
 				},
 			},
@@ -889,7 +888,7 @@ func TestUnlock(t *testing.T) {
 			// 0                          4096
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 4096},
 				},
 			},
@@ -906,7 +905,7 @@ func TestUnlock(t *testing.T) {
 					LockRange: LockRange{0, 1024},
 				},
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{1024, 4096},
 				},
 				{
@@ -974,7 +973,7 @@ func TestUnlock(t *testing.T) {
 			// 0        1024    4096            max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
@@ -991,7 +990,7 @@ func TestUnlock(t *testing.T) {
 			// 0           8    4096            max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 8},
 				},
 				{
@@ -1008,7 +1007,7 @@ func TestUnlock(t *testing.T) {
 			// 0        1024    4096            max uint64
 			before: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
@@ -1025,7 +1024,7 @@ func TestUnlock(t *testing.T) {
 			// 0       1024     4096        8192      max uint64
 			after: []entry{
 				{
-					Lock:      Lock{HasWriter: true, Writer: 0},
+					Lock:      Lock{Writer: 0},
 					LockRange: LockRange{0, 1024},
 				},
 				{
@@ -1041,19 +1040,21 @@ func TestUnlock(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		l := fill(test.before)
+		t.Run(test.name, func(t *testing.T) {
+			l := fill(test.before)
 
-		r := LockRange{Start: test.start, End: test.end}
-		l.unlock(test.uid, r)
-		var got []entry
-		for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
-			got = append(got, entry{
-				Lock:      seg.Value(),
-				LockRange: seg.Range(),
-			})
-		}
-		if !equals(got, test.after) {
-			t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after)
-		}
+			r := LockRange{Start: test.start, End: test.end}
+			l.unlock(test.uid, r)
+			var got []entry
+			for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+				got = append(got, entry{
+					Lock:      seg.Value(),
+					LockRange: seg.Range(),
+				})
+			}
+			if !equals(got, test.after) {
+				t.Errorf("got set %+v, want %+v", got, test.after)
+			}
+		})
 	}
 }
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index b414ddaee..3f2bd0e87 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -17,13 +17,9 @@ package fs
 import (
 	"fmt"
 	"math"
-	"path"
-	"strings"
 	"syscall"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -625,71 +621,3 @@ func (mns *MountNamespace) SyncAll(ctx context.Context) {
 	defer mns.mu.Unlock()
 	mns.root.SyncAll(ctx)
 }
-
-// ResolveExecutablePath resolves the given executable name given a set of
-// paths that might contain it.
-func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name string, paths []string) (string, error) {
-	// Absolute paths can be used directly.
-	if path.IsAbs(name) {
-		return name, nil
-	}
-
-	// Paths with '/' in them should be joined to the working directory, or
-	// to the root if working directory is not set.
-	if strings.IndexByte(name, '/') > 0 {
-		if wd == "" {
-			wd = "/"
-		}
-		if !path.IsAbs(wd) {
-			return "", fmt.Errorf("working directory %q must be absolute", wd)
-		}
-		return path.Join(wd, name), nil
-	}
-
-	// Otherwise, We must lookup the name in the paths, starting from the
-	// calling context's root directory.
-	root := RootFromContext(ctx)
-	if root == nil {
-		// Caller has no root. Don't bother traversing anything.
-		return "", syserror.ENOENT
-	}
-	defer root.DecRef()
-	for _, p := range paths {
-		binPath := path.Join(p, name)
-		traversals := uint(linux.MaxSymlinkTraversals)
-		d, err := mns.FindInode(ctx, root, nil, binPath, &traversals)
-		if err == syserror.ENOENT || err == syserror.EACCES {
-			// Didn't find it here.
-			continue
-		}
-		if err != nil {
-			return "", err
-		}
-		defer d.DecRef()
-
-		// Check that it is a regular file.
-		if !IsRegular(d.Inode.StableAttr) {
-			continue
-		}
-
-		// Check whether we can read and execute the found file.
-		if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil {
-			log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err)
-			continue
-		}
-		return path.Join("/", p, name), nil
-	}
-	return "", syserror.ENOENT
-}
-
-// GetPath returns the PATH as a slice of strings given the environment
-// variables.
-func GetPath(env []string) []string {
-	const prefix = "PATH="
-	for _, e := range env {
-		if strings.HasPrefix(e, prefix) {
-			return strings.Split(strings.TrimPrefix(e, prefix), ":")
-		}
-	}
-	return nil
-}
diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD
index f37f979f1..66e949c95 100644
--- a/pkg/sentry/fs/user/BUILD
+++ b/pkg/sentry/fs/user/BUILD
@@ -4,15 +4,21 @@ package(licenses = ["notice"])
 
 go_library(
     name = "user",
-    srcs = ["user.go"],
+    srcs = [
+        "path.go",
+        "user.go",
+    ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/log",
         "//pkg/sentry/fs",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
+        "//pkg/syserror",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go
new file mode 100644
index 000000000..397e96045
--- /dev/null
+++ b/pkg/sentry/fs/user/path.go
@@ -0,0 +1,170 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package user
+
+import (
+	"fmt"
+	"path"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ResolveExecutablePath resolves the given executable name given the working
+// dir and environment.
+func ResolveExecutablePath(ctx context.Context, args *kernel.CreateProcessArgs) (string, error) {
+	name := args.Filename
+	if len(name) == 0 {
+		if len(args.Argv) == 0 {
+			return "", fmt.Errorf("no filename or command provided")
+		}
+		name = args.Argv[0]
+	}
+
+	// Absolute paths can be used directly.
+	if path.IsAbs(name) {
+		return name, nil
+	}
+
+	// Paths with '/' in them should be joined to the working directory, or
+	// to the root if working directory is not set.
+	if strings.IndexByte(name, '/') > 0 {
+		wd := args.WorkingDirectory
+		if wd == "" {
+			wd = "/"
+		}
+		if !path.IsAbs(wd) {
+			return "", fmt.Errorf("working directory %q must be absolute", wd)
+		}
+		return path.Join(wd, name), nil
+	}
+
+	// Otherwise, We must lookup the name in the paths.
+	paths := getPath(args.Envv)
+	if kernel.VFS2Enabled {
+		f, err := resolveVFS2(ctx, args.Credentials, args.MountNamespaceVFS2, paths, name)
+		if err != nil {
+			return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err)
+		}
+		return f, nil
+	}
+
+	f, err := resolve(ctx, args.MountNamespace, paths, name)
+	if err != nil {
+		return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err)
+	}
+	return f, nil
+}
+
+func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name string) (string, error) {
+	root := fs.RootFromContext(ctx)
+	if root == nil {
+		// Caller has no root. Don't bother traversing anything.
+		return "", syserror.ENOENT
+	}
+	defer root.DecRef()
+	for _, p := range paths {
+		if !path.IsAbs(p) {
+			// Relative paths aren't safe, no one should be using them.
+			log.Warningf("Skipping relative path %q in $PATH", p)
+			continue
+		}
+
+		binPath := path.Join(p, name)
+		traversals := uint(linux.MaxSymlinkTraversals)
+		d, err := mns.FindInode(ctx, root, nil, binPath, &traversals)
+		if err == syserror.ENOENT || err == syserror.EACCES {
+			// Didn't find it here.
+			continue
+		}
+		if err != nil {
+			return "", err
+		}
+		defer d.DecRef()
+
+		// Check that it is a regular file.
+		if !fs.IsRegular(d.Inode.StableAttr) {
+			continue
+		}
+
+		// Check whether we can read and execute the found file.
+		if err := d.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil {
+			log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err)
+			continue
+		}
+		return path.Join("/", p, name), nil
+	}
+
+	// Couldn't find it.
+	return "", syserror.ENOENT
+}
+
+func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) {
+	root := mns.Root()
+	defer root.DecRef()
+	for _, p := range paths {
+		if !path.IsAbs(p) {
+			// Relative paths aren't safe, no one should be using them.
+			log.Warningf("Skipping relative path %q in $PATH", p)
+			continue
+		}
+
+		binPath := path.Join(p, name)
+		pop := &vfs.PathOperation{
+			Root:               root,
+			Start:              root,
+			Path:               fspath.Parse(binPath),
+			FollowFinalSymlink: true,
+		}
+		opts := &vfs.OpenOptions{
+			FileExec: true,
+			Flags:    linux.O_RDONLY,
+		}
+		dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
+		if err == syserror.ENOENT || err == syserror.EACCES {
+			// Didn't find it here.
+			continue
+		}
+		if err != nil {
+			return "", err
+		}
+		dentry.DecRef()
+
+		return binPath, nil
+	}
+
+	// Couldn't find it.
+	return "", syserror.ENOENT
+}
+
+// getPath returns the PATH as a slice of strings given the environment
+// variables.
+func getPath(env []string) []string {
+	const prefix = "PATH="
+	for _, e := range env {
+		if strings.HasPrefix(e, prefix) {
+			return strings.Split(strings.TrimPrefix(e, prefix), ":")
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go
index fe7f67c00..f4d525523 100644
--- a/pkg/sentry/fs/user/user.go
+++ b/pkg/sentry/fs/user/user.go
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Package user contains methods for resolving filesystem paths based on the
+// user and their environment.
 package user
 
 import (
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 585764223..cf440dce8 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -23,6 +23,7 @@ go_library(
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/unimpl",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index c03c65445..9b0e0cca2 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -28,6 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -116,6 +117,8 @@ type rootInode struct {
 	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
 
+	locks lock.FileLocks
+
 	// Keep a reference to this inode's dentry.
 	dentry kernfs.Dentry
 
@@ -183,7 +186,7 @@ func (i *rootInode) masterClose(t *Terminal) {
 
 // Open implements kernfs.Inode.Open.
 func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
index e201801d6..f7bc325d1 100644
--- a/pkg/sentry/fsimpl/devpts/line_discipline.go
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -27,8 +27,6 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// LINT.IfChange
-
 const (
 	// canonMaxBytes is the number of bytes that fit into a single line of
 	// terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE
@@ -445,5 +443,3 @@ func (l *lineDiscipline) peek(b []byte) int {
 	}
 	return size
 }
-
-// LINT.ThenChange(../../fs/tty/line_discipline.go)
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 04a292927..1d22adbe3 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -22,13 +22,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// LINT.IfChange
-
 // masterInode is the inode for the master end of the Terminal.
 type masterInode struct {
 	kernfs.InodeAttrs
@@ -36,6 +35,8 @@ type masterInode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
+	locks lock.FileLocks
+
 	// Keep a reference to this inode's dentry.
 	dentry kernfs.Dentry
 
@@ -57,6 +58,7 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf
 		inode: mi,
 		t:     t,
 	}
+	fd.LockFD.Init(&mi.locks)
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		mi.DecRef()
 		return nil, err
@@ -87,6 +89,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
 type masterFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	inode *masterInode
 	t     *Terminal
@@ -222,5 +225,3 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
 		unimpl.EmitUnimplementedEvent(ctx)
 	}
 }
-
-// LINT.ThenChange(../../fs/tty/master.go)
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index 29a6be858..dffb4232c 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -25,8 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// LINT.IfChange
-
 // waitBufMaxBytes is the maximum size of a wait buffer. It is based on
 // TTYB_DEFAULT_MEM_LIMIT.
 const waitBufMaxBytes = 131072
@@ -236,5 +234,3 @@ func (q *queue) waitBufAppend(b []byte) {
 	q.waitBuf = append(q.waitBuf, b)
 	q.waitBufLen += uint64(len(b))
 }
-
-// LINT.ThenChange(../../fs/tty/queue.go)
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
index 0a98dc896..7fe475080 100644
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -21,13 +21,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// LINT.IfChange
-
 // slaveInode is the inode for the slave end of the Terminal.
 type slaveInode struct {
 	kernfs.InodeAttrs
@@ -35,6 +34,8 @@ type slaveInode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
+	locks lock.FileLocks
+
 	// Keep a reference to this inode's dentry.
 	dentry kernfs.Dentry
 
@@ -53,6 +54,7 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs
 	fd := &slaveFileDescription{
 		inode: si,
 	}
+	fd.LockFD.Init(&si.locks)
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		si.DecRef()
 		return nil, err
@@ -93,6 +95,7 @@ func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
 type slaveFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	inode *slaveInode
 }
@@ -182,5 +185,3 @@ func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions)
 	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
 	return sfd.inode.Stat(fs, opts)
 }
-
-// LINT.ThenChange(../../fs/tty/slave.go)
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
index b44e673d8..7d2781c54 100644
--- a/pkg/sentry/fsimpl/devpts/terminal.go
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -22,8 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-// LINT.IfChanges
-
 // Terminal is a pseudoterminal.
 //
 // +stateify savable
@@ -120,5 +118,3 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
 	}
 	return tm.slaveKTTY
 }
-
-// LINT.ThenChange(../../fs/tty/terminal.go)
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index c573d7935..d12d78b84 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -37,6 +37,7 @@ type EventFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	// queue is used to notify interested parties when the event object
 	// becomes readable or writable.
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index ff861d0fe..973fa0def 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -60,6 +60,7 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/syscalls/linux",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index bfbd7c3d4..6bd1a9fc6 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -60,3 +60,15 @@ func (d *dentry) DecRef() {
 	// inode.decRef().
 	d.inode.decRef()
 }
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 92f7da40d..90b086468 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -26,6 +26,7 @@ import (
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 485f86f4b..e4b434b13 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -54,6 +55,8 @@ type inode struct {
 	// diskInode gives us access to the inode struct on disk. Immutable.
 	diskInode disklayout.Inode
 
+	locks lock.FileLocks
+
 	// This is immutable. The first field of the implementations must have inode
 	// as the first field to ensure temporality.
 	impl interface{}
@@ -157,6 +160,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
+		fd.LockFD.Init(&in.locks)
 		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -168,6 +172,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
+		fd.LockFD.Init(&in.locks)
 		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -178,6 +183,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
+		fd.LockFD.Init(&in.locks)
 		fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index 30135ddb0..f7015c44f 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -77,6 +77,7 @@ func (in *inode) isRegular() bool {
 // vfs.FileDescriptionImpl.
 type regularFileFD struct {
 	fileDescription
+	vfs.LockFD
 
 	// off is the file offset. off is accessed using atomic memory operations.
 	off int64
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 5ce82b793..5cdeeaeb5 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -35,8 +35,8 @@ go_library(
         "fstree.go",
         "gofer.go",
         "handle.go",
+        "host_named_pipe.go",
         "p9file.go",
-        "pagemath.go",
         "regular_file.go",
         "socket.go",
         "special_file.go",
@@ -48,11 +48,13 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fd",
+        "//pkg/fdnotifier",
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
         "//pkg/safemem",
         "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/host",
         "//pkg/sentry/hostfd",
         "//pkg/sentry/kernel",
@@ -67,11 +69,13 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/unet",
         "//pkg/usermem",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 7f2181216..40933b74b 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -760,7 +760,7 @@ afterTrailingSymlink:
 			parent.dirMu.Unlock()
 			return nil, syserror.EPERM
 		}
-		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
+		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds)
 		parent.dirMu.Unlock()
 		return fd, err
 	}
@@ -801,6 +801,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 				return nil, err
 			}
 			fd := &regularFileFD{}
+			fd.LockFD.Init(&d.locks)
 			if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
 				AllowDirectIO: true,
 			}); err != nil {
@@ -826,6 +827,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 			}
 		}
 		fd := &directoryFD{}
+		fd.LockFD.Init(&d.locks)
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -842,7 +844,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
 		}
 	case linux.S_IFIFO:
 		if d.isSynthetic() {
-			return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags)
+			return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks)
 		}
 	}
 	return d.openSpecialFileLocked(ctx, mnt, opts)
@@ -873,19 +875,37 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts
 	if opts.Flags&linux.O_DIRECT != 0 {
 		return nil, syserror.EINVAL
 	}
-	h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0)
+	// We assume that the server silently inserts O_NONBLOCK in the open flags
+	// for all named pipes (because all existing gofers do this).
+	//
+	// NOTE(b/133875563): This makes named pipe opens racy, because the
+	// mechanisms for translating nonblocking to blocking opens can only detect
+	// the instantaneous presence of a peer holding the other end of the pipe
+	// open, not whether the pipe was *previously* opened by a peer that has
+	// since closed its end.
+	isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0
+retry:
+	h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
 	if err != nil {
+		if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO {
+			// An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails
+			// with ENXIO if opening the same named pipe with O_WRONLY would
+			// block because there are no readers of the pipe.
+			if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+				return nil, err
+			}
+			goto retry
+		}
 		return nil, err
 	}
-	seekable := d.fileType() == linux.S_IFREG
-	fd := &specialFileFD{
-		handle:   h,
-		seekable: seekable,
+	if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 {
+		if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil {
+			h.close(ctx)
+			return nil, err
+		}
 	}
-	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
-		DenyPRead:  !seekable,
-		DenyPWrite: !seekable,
-	}); err != nil {
+	fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags)
+	if err != nil {
 		h.close(ctx)
 		return nil, err
 	}
@@ -894,7 +914,7 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts
 
 // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
 // !d.isSynthetic().
-func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
 	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
 	}
@@ -947,6 +967,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		return nil, err
 	}
+	*ds = appendDentry(*ds, child)
 	// Incorporate the fid that was opened by lcreate.
 	useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
 	if useRegularFileFD {
@@ -959,10 +980,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
 		child.handleMu.Unlock()
 	}
-	// Take a reference on the new dentry to be held by the new file
-	// description. (This reference also means that the new dentry is not
-	// eligible for caching yet, so we don't need to append to a dentry slice.)
-	child.refs = 1
 	// Insert the dentry into the tree.
 	d.cacheNewChildLocked(child, name)
 	if d.cachedMetadataAuthoritative() {
@@ -974,6 +991,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 	var childVFSFD *vfs.FileDescription
 	if useRegularFileFD {
 		fd := &regularFileFD{}
+		fd.LockFD.Init(&child.locks)
 		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
 			AllowDirectIO: true,
 		}); err != nil {
@@ -981,22 +999,16 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
 		}
 		childVFSFD = &fd.vfsfd
 	} else {
-		seekable := child.fileType() == linux.S_IFREG
-		fd := &specialFileFD{
-			handle: handle{
-				file: openFile,
-				fd:   -1,
-			},
-			seekable: seekable,
+		h := handle{
+			file: openFile,
+			fd:   -1,
 		}
 		if fdobj != nil {
-			fd.handle.fd = int32(fdobj.Release())
+			h.fd = int32(fdobj.Release())
 		}
-		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
-			DenyPRead:  !seekable,
-			DenyPWrite: !seekable,
-		}); err != nil {
-			fd.handle.close(ctx)
+		fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags)
+		if err != nil {
+			h.close(ctx)
 			return nil, err
 		}
 		childVFSFD = &fd.vfsfd
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index ebf063a58..0d88a328e 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -45,6 +45,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -52,6 +53,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -84,12 +86,6 @@ type filesystem struct {
 	// devMinor is the filesystem's minor device number. devMinor is immutable.
 	devMinor uint32
 
-	// uid and gid are the effective KUID and KGID of the filesystem's creator,
-	// and are used as the owner and group for files that don't specify one.
-	// uid and gid are immutable.
-	uid auth.KUID
-	gid auth.KGID
-
 	// renameMu serves two purposes:
 	//
 	// - It synchronizes path resolution with renaming initiated by this
@@ -122,6 +118,8 @@ type filesystemOptions struct {
 	fd      int
 	aname   string
 	interop InteropMode // derived from the "cache" mount option
+	dfltuid auth.KUID
+	dfltgid auth.KGID
 	msize   uint32
 	version string
 
@@ -230,6 +228,15 @@ type InternalFilesystemOptions struct {
 	OpenSocketsByConnecting bool
 }
 
+// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
+// UIDs and GIDs used for files that do not provide a specific owner or group
+// respectively.
+const (
+	// uint32(-2) doesn't work in Go.
+	_V9FS_DEFUID = auth.KUID(4294967294)
+	_V9FS_DEFGID = auth.KGID(4294967294)
+)
+
 // Name implements vfs.FilesystemType.Name.
 func (FilesystemType) Name() string {
 	return Name
@@ -315,6 +322,31 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		}
 	}
 
+	// Parse the default UID and GID.
+	fsopts.dfltuid = _V9FS_DEFUID
+	if dfltuidstr, ok := mopts["dfltuid"]; ok {
+		delete(mopts, "dfltuid")
+		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
+		// in the caller's user namespace, but goferfs isn't
+		// application-mountable.
+		fsopts.dfltuid = auth.KUID(dfltuid)
+	}
+	fsopts.dfltgid = _V9FS_DEFGID
+	if dfltgidstr, ok := mopts["dfltgid"]; ok {
+		delete(mopts, "dfltgid")
+		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.dfltgid = auth.KGID(dfltgid)
+	}
+
 	// Parse the 9P message size.
 	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
 	if msizestr, ok := mopts["msize"]; ok {
@@ -422,8 +454,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		client:           client,
 		clock:            ktime.RealtimeClockFromContext(ctx),
 		devMinor:         devMinor,
-		uid:              creds.EffectiveKUID,
-		gid:              creds.EffectiveKGID,
 		syncableDentries: make(map[*dentry]struct{}),
 		specialFileFDs:   make(map[*specialFileFD]struct{}),
 	}
@@ -634,6 +664,8 @@ type dentry struct {
 	// If this dentry represents a synthetic named pipe, pipe is the pipe
 	// endpoint bound to this file.
 	pipe *pipe.VFSPipe
+
+	locks lock.FileLocks
 }
 
 // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
@@ -672,8 +704,8 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 		file:      file,
 		ino:       qid.Path,
 		mode:      uint32(attr.Mode),
-		uid:       uint32(fs.uid),
-		gid:       uint32(fs.gid),
+		uid:       uint32(fs.opts.dfltuid),
+		gid:       uint32(fs.opts.dfltgid),
 		blockSize: usermem.PageSize,
 		handle: handle{
 			fd: -1,
@@ -928,8 +960,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
 		// so we can't race with Write or another truncate.)
 		d.dataMu.Unlock()
 		if d.size < oldSize {
-			oldpgend := pageRoundUp(oldSize)
-			newpgend := pageRoundUp(d.size)
+			oldpgend, _ := usermem.PageRoundUp(oldSize)
+			newpgend, _ := usermem.PageRoundUp(d.size)
 			if oldpgend != newpgend {
 				d.mapsMu.Lock()
 				d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
@@ -1011,6 +1043,18 @@ func (d *dentry) decRefLocked() {
 	}
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+	return nil
+}
+
 // checkCachingLocked should be called after d's reference count becomes 0 or it
 // becomes disowned.
 //
@@ -1326,6 +1370,9 @@ func (d *dentry) decLinks() {
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	lockLogging sync.Once
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -1376,3 +1423,19 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption
 func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
 	return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name)
 }
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	fd.lockLogging.Do(func() {
+		log.Infof("File lock using gofer file handled internally.")
+	})
+	return fd.LockFD.LockBSD(ctx, uid, t, block)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+	fd.lockLogging.Do(func() {
+		log.Infof("Range lock using gofer file handled internally.")
+	})
+	return fd.LockFD.LockPOSIX(ctx, uid, t, rng, block)
+}
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
new file mode 100644
index 000000000..7294de7d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create
+// pipes after sentry initialization due to syscall filters.
+var (
+	tempPipeMu      sync.Mutex
+	tempPipeReadFD  int
+	tempPipeWriteFD int
+	tempPipeBuf     [1]byte
+)
+
+func init() {
+	var pipeFDs [2]int
+	if err := unix.Pipe(pipeFDs[:]); err != nil {
+		panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err))
+	}
+	tempPipeReadFD = pipeFDs[0]
+	tempPipeWriteFD = pipeFDs[1]
+}
+
+func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
+	for {
+		ok, err := nonblockingPipeHasWriter(fd)
+		if err != nil {
+			return err
+		}
+		if ok {
+			return nil
+		}
+		if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+			return err
+		}
+	}
+}
+
+func nonblockingPipeHasWriter(fd int32) (bool, error) {
+	tempPipeMu.Lock()
+	defer tempPipeMu.Unlock()
+	// Copy 1 byte from fd into the temporary pipe.
+	n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK)
+	if err == syserror.EAGAIN {
+		// The pipe represented by fd is empty, but has a writer.
+		return true, nil
+	}
+	if err != nil {
+		return false, err
+	}
+	if n == 0 {
+		// The pipe represented by fd is empty and has no writer.
+		return false, nil
+	}
+	// The pipe represented by fd is non-empty, so it either has, or has
+	// previously had, a writer. Remove the byte copied to the temporary pipe
+	// before returning.
+	if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 {
+		panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err))
+	}
+	return true, nil
+}
+
+func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error {
+	t := time.NewTimer(100 * time.Millisecond)
+	defer t.Stop()
+	cancel := ctx.SleepStart()
+	select {
+	case <-t.C:
+		ctx.SleepFinish(true)
+		return nil
+	case <-cancel:
+		ctx.SleepFinish(false)
+		return syserror.ErrInterrupted
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 857f7c74e..0d10cf7ac 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -148,9 +148,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 			return 0, err
 		}
 		// Remove touched pages from the cache.
-		pgstart := pageRoundDown(uint64(offset))
-		pgend := pageRoundUp(uint64(offset + src.NumBytes()))
-		if pgend < pgstart {
+		pgstart := usermem.PageRoundDown(uint64(offset))
+		pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
+		if !ok {
 			return 0, syserror.EINVAL
 		}
 		mr := memmap.MappableRange{pgstart, pgend}
@@ -306,9 +306,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
 			if fillCache {
 				// Read into the cache, then re-enter the loop to read from the
 				// cache.
+				gapEnd, _ := usermem.PageRoundUp(gapMR.End)
 				reqMR := memmap.MappableRange{
-					Start: pageRoundDown(gapMR.Start),
-					End:   pageRoundUp(gapMR.End),
+					Start: usermem.PageRoundDown(gapMR.Start),
+					End:   gapEnd,
 				}
 				optMR := gap.Range()
 				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
@@ -671,7 +672,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
 
 	// Constrain translations to d.size (rounded up) to prevent translation to
 	// pages that may be concurrently truncated.
-	pgend := pageRoundUp(d.size)
+	pgend, _ := usermem.PageRoundUp(d.size)
 	var beyondEOF bool
 	if required.End > pgend {
 		if required.Start >= pgend {
@@ -818,43 +819,15 @@ type dentryPlatformFile struct {
 // IncRef implements platform.File.IncRef.
 func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
 	d.dataMu.Lock()
-	seg, gap := d.fdRefs.Find(fr.Start)
-	for {
-		switch {
-		case seg.Ok() && seg.Start() < fr.End:
-			seg = d.fdRefs.Isolate(seg, fr)
-			seg.SetValue(seg.Value() + 1)
-			seg, gap = seg.NextNonEmpty()
-		case gap.Ok() && gap.Start() < fr.End:
-			newRange := gap.Range().Intersect(fr)
-			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
-			seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
-		default:
-			d.fdRefs.MergeAdjacent(fr)
-			d.dataMu.Unlock()
-			return
-		}
-	}
+	d.fdRefs.IncRefAndAccount(fr)
+	d.dataMu.Unlock()
 }
 
 // DecRef implements platform.File.DecRef.
 func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
 	d.dataMu.Lock()
-	seg := d.fdRefs.FindSegment(fr.Start)
-
-	for seg.Ok() && seg.Start() < fr.End {
-		seg = d.fdRefs.Isolate(seg, fr)
-		if old := seg.Value(); old == 1 {
-			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
-			seg = d.fdRefs.Remove(seg).NextSegment()
-		} else {
-			seg.SetValue(old - 1)
-			seg = seg.NextSegment()
-		}
-	}
-	d.fdRefs.MergeAdjacent(fr)
+	d.fdRefs.DecRefAndAccount(fr)
 	d.dataMu.Unlock()
-
 }
 
 // MapInternal implements platform.File.MapInternal.
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index a464e6a94..289efdd25 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -19,17 +19,19 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// specialFileFD implements vfs.FileDescriptionImpl for files other than
-// regular files, directories, and symlinks: pipes, sockets, etc. It is also
-// used for regular files when filesystemOptions.specialRegularFiles is in
-// effect. specialFileFD differs from regularFileFD by using per-FD handles
-// instead of shared per-dentry handles, and never buffering I/O.
+// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
+// special files, and (when filesystemOptions.specialRegularFiles is in effect)
+// regular files. specialFileFD differs from regularFileFD by using per-FD
+// handles instead of shared per-dentry handles, and never buffering I/O.
 type specialFileFD struct {
 	fileDescription
 
@@ -40,13 +42,48 @@ type specialFileFD struct {
 	// file offset is significant, i.e. a regular file. seekable is immutable.
 	seekable bool
 
+	// mayBlock is true if this file description represents a file for which
+	// queue may send I/O readiness events. mayBlock is immutable.
+	mayBlock bool
+	queue    waiter.Queue
+
 	// If seekable is true, off is the file offset. off is protected by mu.
 	mu  sync.Mutex
 	off int64
 }
 
+func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *lock.FileLocks, flags uint32) (*specialFileFD, error) {
+	ftype := d.fileType()
+	seekable := ftype == linux.S_IFREG
+	mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK
+	fd := &specialFileFD{
+		handle:   h,
+		seekable: seekable,
+		mayBlock: mayBlock,
+	}
+	fd.LockFD.Init(locks)
+	if mayBlock && h.fd >= 0 {
+		if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
+			return nil, err
+		}
+	}
+	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+		DenyPRead:  !seekable,
+		DenyPWrite: !seekable,
+	}); err != nil {
+		if mayBlock && h.fd >= 0 {
+			fdnotifier.RemoveFD(h.fd)
+		}
+		return nil, err
+	}
+	return fd, nil
+}
+
 // Release implements vfs.FileDescriptionImpl.Release.
 func (fd *specialFileFD) Release() {
+	if fd.mayBlock && fd.handle.fd >= 0 {
+		fdnotifier.RemoveFD(fd.handle.fd)
+	}
 	fd.handle.close(context.Background())
 	fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 	fs.syncMu.Lock()
@@ -62,6 +99,32 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error {
 	return fd.handle.file.flush(ctx)
 }
 
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if fd.mayBlock {
+		return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
+	}
+	return fd.fileDescription.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	if fd.mayBlock {
+		fd.queue.EventRegister(e, mask)
+		return
+	}
+	fd.fileDescription.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
+	if fd.mayBlock {
+		fd.queue.EventUnregister(e)
+		return
+	}
+	fd.fileDescription.EventUnregister(e)
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	if fd.seekable && offset < 0 {
@@ -81,6 +144,9 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 	}
 	buf := make([]byte, dst.NumBytes())
 	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
 	if n == 0 {
 		return 0, err
 	}
@@ -130,6 +196,9 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 		return 0, err
 	}
 	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
 	return int64(n), err
 }
 
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 2608e7e1d..1d5aa82dc 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -38,6 +38,9 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
 
 // Preconditions: fs.interop != InteropModeShared.
 func (d *dentry) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
+		return
+	}
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return
 	}
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 39509f703..54f16ad63 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -8,6 +8,7 @@ go_library(
         "control.go",
         "host.go",
         "ioctl_unsafe.go",
+        "mmap.go",
         "socket.go",
         "socket_iovec.go",
         "socket_unsafe.go",
@@ -23,18 +24,22 @@ go_library(
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/hostfd",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
+        "//pkg/sentry/platform",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/unimpl",
         "//pkg/sentry/uniqueid",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 8caf55a1b..5ec5100b8 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -34,6 +34,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -86,15 +87,13 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
 
 	i := &inode{
 		hostFD:     hostFD,
-		seekable:   seekable,
+		ino:        fs.NextIno(),
 		isTTY:      opts.IsTTY,
-		canMap:     canMap(uint32(fileType)),
 		wouldBlock: wouldBlock(uint32(fileType)),
-		ino:        fs.NextIno(),
-		// For simplicity, set offset to 0. Technically, we should use the existing
-		// offset on the host if the file is seekable.
-		offset: 0,
+		seekable:   seekable,
+		canMap:     canMap(uint32(fileType)),
 	}
+	i.pf.inode = i
 
 	// Non-seekable files can't be memory mapped, assert this.
 	if !i.seekable && i.canMap {
@@ -117,6 +116,10 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
 
 	// i.open will take a reference on d.
 	defer d.DecRef()
+
+	// For simplicity, fileDescription.offset is set to 0. Technically, we
+	// should only set to 0 on files that are not seekable (sockets, pipes,
+	// etc.), and use the offset from the host fd otherwise when importing.
 	return i.open(ctx, d.VFSDentry(), mnt, flags)
 }
 
@@ -180,6 +183,8 @@ type inode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
 
+	locks lock.FileLocks
+
 	// When the reference count reaches zero, the host fd is closed.
 	refs.AtomicRefCount
 
@@ -189,11 +194,15 @@ type inode struct {
 	// This field is initialized at creation time and is immutable.
 	hostFD int
 
-	// wouldBlock is true if the host FD would return EWOULDBLOCK for
-	// operations that would block.
+	// ino is an inode number unique within this filesystem.
 	//
 	// This field is initialized at creation time and is immutable.
-	wouldBlock bool
+	ino uint64
+
+	// isTTY is true if this file represents a TTY.
+	//
+	// This field is initialized at creation time and is immutable.
+	isTTY bool
 
 	// seekable is false if the host fd points to a file representing a stream,
 	// e.g. a socket or a pipe. Such files are not seekable and can return
@@ -202,29 +211,29 @@ type inode struct {
 	// This field is initialized at creation time and is immutable.
 	seekable bool
 
-	// isTTY is true if this file represents a TTY.
+	// wouldBlock is true if the host FD would return EWOULDBLOCK for
+	// operations that would block.
 	//
 	// This field is initialized at creation time and is immutable.
-	isTTY bool
+	wouldBlock bool
+
+	// Event queue for blocking operations.
+	queue waiter.Queue
 
 	// canMap specifies whether we allow the file to be memory mapped.
 	//
 	// This field is initialized at creation time and is immutable.
 	canMap bool
 
-	// ino is an inode number unique within this filesystem.
-	//
-	// This field is initialized at creation time and is immutable.
-	ino uint64
-
-	// offsetMu protects offset.
-	offsetMu sync.Mutex
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex
 
-	// offset specifies the current file offset.
-	offset int64
+	// If canMap is true, mappings tracks mappings of hostFD into
+	// memmap.MappingSpaces.
+	mappings memmap.MappingSet
 
-	// Event queue for blocking operations.
-	queue waiter.Queue
+	// pf implements platform.File for mappings of hostFD.
+	pf inodePlatformFile
 }
 
 // CheckPermissions implements kernfs.Inode.
@@ -388,6 +397,21 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 		if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
 			return err
 		}
+		oldSize := uint64(hostStat.Size)
+		if s.Size < oldSize {
+			oldpgend, _ := usermem.PageRoundUp(oldSize)
+			newpgend, _ := usermem.PageRoundUp(s.Size)
+			if oldpgend != newpgend {
+				i.mapsMu.Lock()
+				i.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+					// Compare Linux's mm/truncate.c:truncate_setsize() =>
+					// truncate_pagecache() =>
+					// mm/memory.c:unmap_mapping_range(evencows=1).
+					InvalidatePrivate: true,
+				})
+				i.mapsMu.Unlock()
+			}
+		}
 	}
 	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
 		ts := [2]syscall.Timespec{
@@ -447,7 +471,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 			return nil, err
 		}
 		// Currently, we only allow Unix sockets to be imported.
-		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d)
+		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks)
 	}
 
 	// TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that
@@ -457,6 +481,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 			fileDescription: fileDescription{inode: i},
 			termios:         linux.DefaultSlaveTermios,
 		}
+		fd.LockFD.Init(&i.locks)
 		vfsfd := &fd.vfsfd
 		if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
@@ -464,10 +489,8 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 		return vfsfd, nil
 	}
 
-	// For simplicity, set offset to 0. Technically, we should
-	// only set to 0 on files that are not seekable (sockets, pipes, etc.),
-	// and use the offset from the host fd otherwise.
 	fd := &fileDescription{inode: i}
+	fd.LockFD.Init(&i.locks)
 	vfsfd := &fd.vfsfd
 	if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
@@ -479,6 +502,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
 	// cached to reduce indirections and casting. fileDescription does not hold
@@ -487,6 +511,13 @@ type fileDescription struct {
 	//
 	// inode is immutable after fileDescription creation.
 	inode *inode
+
+	// offsetMu protects offset.
+	offsetMu sync.Mutex
+
+	// offset specifies the current file offset. It is only meaningful when
+	// inode.seekable is true.
+	offset int64
 }
 
 // SetStat implements vfs.FileDescriptionImpl.
@@ -532,10 +563,10 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
 		return n, err
 	}
 	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
-	i.offsetMu.Lock()
-	n, err := readFromHostFD(ctx, i.hostFD, dst, i.offset, opts.Flags)
-	i.offset += n
-	i.offsetMu.Unlock()
+	f.offsetMu.Lock()
+	n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
+	f.offset += n
+	f.offsetMu.Unlock()
 	return n, err
 }
 
@@ -572,10 +603,10 @@ func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opt
 	}
 	// TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so.
 	// TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file.
-	i.offsetMu.Lock()
-	n, err := writeToHostFD(ctx, i.hostFD, src, i.offset, opts.Flags)
-	i.offset += n
-	i.offsetMu.Unlock()
+	f.offsetMu.Lock()
+	n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags)
+	f.offset += n
+	f.offsetMu.Unlock()
 	return n, err
 }
 
@@ -600,41 +631,41 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
 		return 0, syserror.ESPIPE
 	}
 
-	i.offsetMu.Lock()
-	defer i.offsetMu.Unlock()
+	f.offsetMu.Lock()
+	defer f.offsetMu.Unlock()
 
 	switch whence {
 	case linux.SEEK_SET:
 		if offset < 0 {
-			return i.offset, syserror.EINVAL
+			return f.offset, syserror.EINVAL
 		}
-		i.offset = offset
+		f.offset = offset
 
 	case linux.SEEK_CUR:
-		// Check for overflow. Note that underflow cannot occur, since i.offset >= 0.
-		if offset > math.MaxInt64-i.offset {
-			return i.offset, syserror.EOVERFLOW
+		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
+		if offset > math.MaxInt64-f.offset {
+			return f.offset, syserror.EOVERFLOW
 		}
-		if i.offset+offset < 0 {
-			return i.offset, syserror.EINVAL
+		if f.offset+offset < 0 {
+			return f.offset, syserror.EINVAL
 		}
-		i.offset += offset
+		f.offset += offset
 
 	case linux.SEEK_END:
 		var s syscall.Stat_t
 		if err := syscall.Fstat(i.hostFD, &s); err != nil {
-			return i.offset, err
+			return f.offset, err
 		}
 		size := s.Size
 
 		// Check for overflow. Note that underflow cannot occur, since size >= 0.
 		if offset > math.MaxInt64-size {
-			return i.offset, syserror.EOVERFLOW
+			return f.offset, syserror.EOVERFLOW
 		}
 		if size+offset < 0 {
-			return i.offset, syserror.EINVAL
+			return f.offset, syserror.EINVAL
 		}
-		i.offset = size + offset
+		f.offset = size + offset
 
 	case linux.SEEK_DATA, linux.SEEK_HOLE:
 		// Modifying the offset in the host file table should not matter, since
@@ -643,16 +674,16 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
 		// For reading and writing, we always rely on our internal offset.
 		n, err := unix.Seek(i.hostFD, offset, int(whence))
 		if err != nil {
-			return i.offset, err
+			return f.offset, err
 		}
-		i.offset = n
+		f.offset = n
 
 	default:
 		// Invalid whence.
-		return i.offset, syserror.EINVAL
+		return f.offset, syserror.EINVAL
 	}
 
-	return i.offset, nil
+	return f.offset, nil
 }
 
 // Sync implements FileDescriptionImpl.
@@ -666,8 +697,9 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts
 	if !f.inode.canMap {
 		return syserror.ENODEV
 	}
-	// TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface.
-	return syserror.ENODEV
+	i := f.inode
+	i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init)
+	return vfs.GenericConfigureMMap(&f.vfsfd, i, opts)
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
new file mode 100644
index 000000000..8545a82f0
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -0,0 +1,132 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// inodePlatformFile implements platform.File. It exists solely because inode
+// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef.
+//
+// inodePlatformFile should only be used if inode.canMap is true.
+type inodePlatformFile struct {
+	*inode
+
+	// fdRefsMu protects fdRefs.
+	fdRefsMu sync.Mutex
+
+	// fdRefs counts references on platform.File offsets. It is used solely for
+	// memory accounting.
+	fdRefs fsutil.FrameRefSet
+
+	// fileMapper caches mappings of the host file represented by this inode.
+	fileMapper fsutil.HostFileMapper
+
+	// fileMapperInitOnce is used to lazily initialize fileMapper.
+	fileMapperInitOnce sync.Once
+}
+
+// IncRef implements platform.File.IncRef.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inodePlatformFile) IncRef(fr platform.FileRange) {
+	i.fdRefsMu.Lock()
+	i.fdRefs.IncRefAndAccount(fr)
+	i.fdRefsMu.Unlock()
+}
+
+// DecRef implements platform.File.DecRef.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inodePlatformFile) DecRef(fr platform.FileRange) {
+	i.fdRefsMu.Lock()
+	i.fdRefs.DecRefAndAccount(fr)
+	i.fdRefsMu.Unlock()
+}
+
+// MapInternal implements platform.File.MapInternal.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+	return i.fileMapper.MapInternal(fr, i.hostFD, at.Write)
+}
+
+// FD implements platform.File.FD.
+func (i *inodePlatformFile) FD() int {
+	return i.hostFD
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	i.mapsMu.Lock()
+	mapped := i.mappings.AddMapping(ms, ar, offset, writable)
+	for _, r := range mapped {
+		i.pf.fileMapper.IncRefOn(r)
+	}
+	i.mapsMu.Unlock()
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	i.mapsMu.Lock()
+	unmapped := i.mappings.RemoveMapping(ms, ar, offset, writable)
+	for _, r := range unmapped {
+		i.pf.fileMapper.DecRefOn(r)
+	}
+	i.mapsMu.Unlock()
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return i.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	mr := optional
+	return []memmap.Translation{
+		{
+			Source: mr,
+			File:   &i.pf,
+			Offset: mr.Start,
+			Perms:  usermem.AnyAccess,
+		},
+	}, nil
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) InvalidateUnsavable(ctx context.Context) error {
+	// We expect the same host fd across save/restore, so all translations
+	// should be valid.
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index ef34cb28a..0299dbde9 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -49,6 +49,7 @@ go_library(
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
@@ -67,6 +68,7 @@ go_test(
         "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/syserror",
         "//pkg/usermem",
         "@com_github_google_go-cmp//cmp:go_default_library",
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 1568a9d49..6418de0a3 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -38,7 +39,8 @@ type DynamicBytesFile struct {
 	InodeNotDirectory
 	InodeNotSymlink
 
-	data vfs.DynamicBytesSource
+	locks lock.FileLocks
+	data  vfs.DynamicBytesSource
 }
 
 var _ Inode = (*DynamicBytesFile)(nil)
@@ -55,7 +57,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint
 // Open implements Inode.Open.
 func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
-	if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
+	if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -77,13 +79,15 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent
 type DynamicBytesFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DynamicBytesFileDescriptionImpl
+	vfs.LockFD
 
 	vfsfd vfs.FileDescription
 	inode Inode
 }
 
 // Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *lock.FileLocks, flags uint32) error {
+	fd.LockFD.Init(locks)
 	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
 		return err
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 8284e76a7..33a5968ca 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -42,6 +43,7 @@ import (
 type GenericDirectoryFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DirectoryFileDescriptionDefaultImpl
+	vfs.LockFD
 
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
@@ -55,9 +57,9 @@ type GenericDirectoryFD struct {
 
 // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
 // dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
 	fd := &GenericDirectoryFD{}
-	if err := fd.Init(children, opts); err != nil {
+	if err := fd.Init(children, locks, opts); err != nil {
 		return nil, err
 	}
 	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
@@ -69,11 +71,12 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre
 // Init initializes a GenericDirectoryFD. Use it when overriding
 // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
 // correct implementation.
-func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error {
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) error {
 	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
+	fd.LockFD.Init(locks)
 	fd.children = children
 	return nil
 }
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 982daa2e6..0e4927215 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -555,6 +556,8 @@ type StaticDirectory struct {
 	InodeAttrs
 	InodeNoDynamicLookup
 	OrderedChildren
+
+	locks lock.FileLocks
 }
 
 var _ Inode = (*StaticDirectory)(nil)
@@ -584,7 +587,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3
 
 // Open implements kernfs.Inode.
 func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
+	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index a83151ad3..bbee8ccda 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -225,9 +225,21 @@ func (d *Dentry) destroy() {
 	}
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *Dentry) Watches() *vfs.Watches {
+	return nil
+}
+
 // InsertChild inserts child into the vfs dentry cache with the given name under
 // this dentry. This does not update the directory inode, so calling this on
-// it's own isn't sufficient to insert a child into a directory. InsertChild
+// its own isn't sufficient to insert a child into a directory. InsertChild
 // updates the link count on d if required.
 //
 // Precondition: d must represent a directory inode.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 412cf6ac9..6749facf7 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -100,8 +101,10 @@ type readonlyDir struct {
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoDynamicLookup
 	kernfs.InodeDirectoryNoNewChildren
-
 	kernfs.OrderedChildren
+
+	locks lock.FileLocks
+
 	dentry kernfs.Dentry
 }
 
@@ -117,7 +120,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
 }
 
 func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -128,10 +131,12 @@ type dir struct {
 	attrs
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoDynamicLookup
+	kernfs.OrderedChildren
+
+	locks lock.FileLocks
 
 	fs     *filesystem
 	dentry kernfs.Dentry
-	kernfs.OrderedChildren
 }
 
 func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
@@ -147,7 +152,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
 }
 
 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD
index 5950a2d59..c618dbe6c 100644
--- a/pkg/sentry/fsimpl/pipefs/BUILD
+++ b/pkg/sentry/fsimpl/pipefs/BUILD
@@ -15,6 +15,7 @@ go_library(
         "//pkg/sentry/kernel/pipe",
         "//pkg/sentry/kernel/time",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/syserror",
         "//pkg/usermem",
     ],
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index cab771211..e4dabaa33 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -81,7 +82,8 @@ type inode struct {
 	kernfs.InodeNotSymlink
 	kernfs.InodeNoopRefCount
 
-	pipe *pipe.VFSPipe
+	locks lock.FileLocks
+	pipe  *pipe.VFSPipe
 
 	ino uint64
 	uid auth.KUID
@@ -147,7 +149,7 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
 
 // Open implements kernfs.Inode.Open.
 func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags)
+	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
 }
 
 // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 17c1342b5..351ba4ee9 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -35,6 +35,7 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/syserror",
         "//pkg/tcpip/header",
         "//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 36a911db4..e2cdb7ee9 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -24,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -37,6 +38,8 @@ type subtasksInode struct {
 	kernfs.OrderedChildren
 	kernfs.AlwaysValid
 
+	locks lock.FileLocks
+
 	fs                *filesystem
 	task              *kernel.Task
 	pidns             *kernel.PIDNamespace
@@ -153,7 +156,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
 // Open implements kernfs.Inode.
 func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &subtasksFD{task: i.task}
-	if err := fd.Init(&i.OrderedChildren, &opts); err != nil {
+	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil {
 		return nil, err
 	}
 	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 482055db1..44078a765 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -38,6 +39,8 @@ type taskInode struct {
 	kernfs.InodeAttrs
 	kernfs.OrderedChildren
 
+	locks lock.FileLocks
+
 	task *kernel.Task
 }
 
@@ -103,7 +106,7 @@ func (i *taskInode) Valid(ctx context.Context) bool {
 
 // Open implements kernfs.Inode.
 func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 44ccc9e4a..ef6c1d04f 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -53,6 +54,8 @@ func taskFDExists(t *kernel.Task, fd int32) bool {
 }
 
 type fdDir struct {
+	locks lock.FileLocks
+
 	fs   *filesystem
 	task *kernel.Task
 
@@ -143,7 +146,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 
 // Open implements kernfs.Inode.
 func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
@@ -270,7 +273,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 
 // Open implements kernfs.Inode.
 func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 2f297e48a..e5eaa91cd 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -30,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -775,6 +776,8 @@ type namespaceInode struct {
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
+
+	locks lock.FileLocks
 }
 
 var _ kernfs.Inode = (*namespaceInode)(nil)
@@ -791,6 +794,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32
 func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &namespaceFD{inode: i}
 	i.IncRef()
+	fd.LockFD.Init(&i.locks)
 	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
@@ -801,6 +805,7 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
 // /proc/[pid]/ns/*.
 type namespaceFD struct {
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	vfsfd vfs.FileDescription
 	inode *namespaceInode
@@ -825,8 +830,3 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err
 func (fd *namespaceFD) Release() {
 	fd.inode.DecRef()
 }
-
-// OnClose implements FileDescriptionImpl.
-func (*namespaceFD) OnClose(context.Context) error {
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index b51d43954..58c8b9d05 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -43,6 +44,8 @@ type tasksInode struct {
 	kernfs.OrderedChildren
 	kernfs.AlwaysValid
 
+	locks lock.FileLocks
+
 	fs    *filesystem
 	pidns *kernel.PIDNamespace
 
@@ -197,7 +200,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 
 // Open implements kernfs.Inode.
 func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index d29ef3f83..242ba9b5d 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -31,6 +31,7 @@ type SignalFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	// target is the original signal target task.
 	//
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index a741e2bb6..237f17def 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -15,6 +15,7 @@ go_library(
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/syserror",
     ],
 )
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0af373604..b84463d3a 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -98,8 +99,10 @@ type dir struct {
 	kernfs.InodeNoDynamicLookup
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
-
 	kernfs.OrderedChildren
+
+	locks lock.FileLocks
+
 	dentry kernfs.Dentry
 }
 
@@ -121,7 +124,7 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set
 
 // Open implements kernfs.Inode.Open.
 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 60c92d626..2dc90d484 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -32,6 +32,7 @@ type TimerFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
 
 	events waiter.Queue
 	timer  *ktime.Timer
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 007be1572..062321cbc 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -59,6 +59,7 @@ go_library(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/uniqueid",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
         "//pkg/sentry/vfs/lock",
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index 83bf885ee..ac54d420d 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -29,7 +29,7 @@ type deviceFile struct {
 	minor uint32
 }
 
-func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
+func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
 	file := &deviceFile{
 		kind:  kind,
 		major: major,
@@ -43,7 +43,7 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode
 	default:
 		panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
 	}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, kuid, kgid, mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index f2399981b..913b8a6c5 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -48,9 +48,9 @@ type directory struct {
 	childList dentryList
 }
 
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory {
+func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory {
 	dir := &directory{}
-	dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode)
+	dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode)
 	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
 	dir.dentry.inode = &dir.inode
 	dir.dentry.vfsd.Init(&dir.dentry)
@@ -79,6 +79,7 @@ func (dir *directory) removeChildLocked(child *dentry) {
 	dir.iterMu.Lock()
 	dir.childList.Remove(child)
 	dir.iterMu.Unlock()
+	child.unlinked = true
 }
 
 type directoryFD struct {
@@ -112,6 +113,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	dir.iterMu.Lock()
 	defer dir.iterMu.Unlock()
 
+	fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
 	fd.inode().touchAtime(fd.vfsfd.Mount())
 
 	if fd.off == 0 {
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 80fa7b29d..72399b321 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -177,6 +177,12 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
 	if err := create(parentDir, name); err != nil {
 		return err
 	}
+
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent)
 	parentDir.inode.touchCMtime()
 	return nil
 }
@@ -241,6 +247,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 			return syserror.EMLINK
 		}
 		d.inode.incLinksLocked()
+		d.inode.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent)
 		parentDir.insertChildLocked(fs.newDentry(d.inode), name)
 		return nil
 	})
@@ -249,11 +256,12 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
 		if parentDir.inode.nlink == maxLinks {
 			return syserror.EMLINK
 		}
 		parentDir.inode.incLinksLocked() // from child's ".."
-		childDir := fs.newDirectory(rp.Credentials(), opts.Mode)
+		childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		parentDir.insertChildLocked(&childDir.dentry, name)
 		return nil
 	})
@@ -262,18 +270,19 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
 	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
 		var childInode *inode
 		switch opts.Mode.FileType() {
 		case 0, linux.S_IFREG:
-			childInode = fs.newRegularFile(rp.Credentials(), opts.Mode)
+			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		case linux.S_IFIFO:
-			childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode)
+			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
 		case linux.S_IFBLK:
-			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
 		case linux.S_IFCHR:
-			childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
 		case linux.S_IFSOCK:
-			childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint)
+			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint)
 		default:
 			return syserror.EINVAL
 		}
@@ -348,12 +357,14 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
 		parentDir.insertChildLocked(child, name)
 		fd, err := child.open(ctx, rp, &opts, true)
 		if err != nil {
 			return nil, err
 		}
+		parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent)
 		parentDir.inode.touchCMtime()
 		return fd, nil
 	}
@@ -388,6 +399,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 	switch impl := d.inode.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
+		fd.LockFD.Init(&d.inode.locks)
 		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
@@ -403,15 +415,16 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
+		fd.LockFD.Init(&d.inode.locks)
 		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return &fd.vfsfd, nil
 	case *symlink:
-		// Can't open symlinks without O_PATH (which is unimplemented).
+		// TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH.
 		return nil, syserror.ELOOP
 	case *namedPipe:
-		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags)
+		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
 	case *deviceFile:
 		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
 	case *socketFile:
@@ -559,6 +572,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		newParentDir.inode.touchCMtime()
 	}
 	renamed.inode.touchCtime()
+
+	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
 	return nil
 }
 
@@ -603,8 +618,11 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 	parentDir.removeChildLocked(child)
-	parentDir.inode.decLinksLocked() // from child's ".."
+	parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent)
+	// Remove links for child, child/., and child/..
 	child.inode.decLinksLocked()
+	child.inode.decLinksLocked()
+	parentDir.inode.decLinksLocked()
 	vfsObj.CommitDeleteDentry(&child.vfsd)
 	parentDir.inode.touchCMtime()
 	return nil
@@ -618,7 +636,14 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 	if err != nil {
 		return err
 	}
-	return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat)
+	if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil {
+		return err
+	}
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
@@ -656,7 +681,8 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
 	return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
-		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target))
 		parentDir.insertChildLocked(child, name)
 		return nil
 	})
@@ -698,6 +724,12 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 		return err
 	}
+
+	// Generate inotify events. Note that this must take place before the link
+	// count of the child is decremented, or else the watches may be dropped
+	// before these events are added.
+	vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name)
+
 	parentDir.removeChildLocked(child)
 	child.inode.decLinksLocked()
 	vfsObj.CommitDeleteDentry(&child.vfsd)
@@ -754,7 +786,12 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	if err != nil {
 		return err
 	}
-	return d.inode.setxattr(rp.Credentials(), &opts)
+	if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
@@ -765,7 +802,12 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 	if err != nil {
 		return err
 	}
-	return d.inode.removexattr(rp.Credentials(), name)
+	if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 8d77b3fa8..739350cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -30,9 +30,9 @@ type namedPipe struct {
 // Preconditions:
 //   * fs.mu must be locked.
 //   * rp.Mount().CheckBeginWrite() has been called successfully.
-func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
-	file.inode.init(file, fs, creds, linux.S_IFIFO|mode)
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
 	file.inode.nlink = 1 // Only the parent has a link.
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 3f433d666..77447b32c 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -25,7 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
-	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -85,12 +84,12 @@ type regularFile struct {
 	size uint64
 }
 
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &regularFile{
 		memFile: fs.memFile,
 		seals:   linux.F_SEAL_SEAL,
 	}
-	file.inode.init(file, fs, creds, linux.S_IFREG|mode)
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
@@ -312,7 +311,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
 	f := fd.inode().impl.(*regularFile)
 	if end := offset + srclen; end < offset {
 		// Overflow.
-		return 0, syserror.EFBIG
+		return 0, syserror.EINVAL
 	}
 
 	var err error
@@ -366,28 +365,6 @@ func (fd *regularFileFD) Sync(ctx context.Context) error {
 	return nil
 }
 
-// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
-func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
-	return fd.inode().lockBSD(uid, t, block)
-}
-
-// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
-func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
-	fd.inode().unlockBSD(uid)
-	return nil
-}
-
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
-	return fd.inode().lockPOSIX(uid, t, rng, block)
-}
-
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
-	fd.inode().unlockPOSIX(uid, rng)
-	return nil
-}
-
 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
 func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
 	file := fd.inode().impl.(*regularFile)
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 25c2321af..3ed650474 100644
--- a/pkg/sentry/fsimpl/tmpfs/socket_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -26,9 +26,9 @@ type socketFile struct {
 	ep    transport.BoundEndpoint
 }
 
-func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
+func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
 	file := &socketFile{ep: ep}
-	file.inode.init(file, fs, creds, mode)
+	file.inode.init(file, fs, kuid, kgid, mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index 47e075ed4..b0de5fabe 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -24,11 +24,11 @@ type symlink struct {
 	target string // immutable
 }
 
-func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode {
 	link := &symlink{
 		target: target,
 	}
-	link.inode.init(link, fs, creds, linux.S_IFLNK|0777)
+	link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode)
 	link.inode.nlink = 1 // from parent directory
 	return &link.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 1e781aecd..71a7522af 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -30,12 +30,12 @@ package tmpfs
 import (
 	"fmt"
 	"math"
+	"strconv"
 	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -112,6 +112,58 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		}
 	}
 
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	rootMode := linux.FileMode(0777)
+	if rootFileType == linux.S_IFDIR {
+		rootMode = 01777
+	}
+	modeStr, ok := mopts["mode"]
+	if ok {
+		delete(mopts, "mode")
+		mode, err := strconv.ParseUint(modeStr, 8, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
+			return nil, nil, syserror.EINVAL
+		}
+		rootMode = linux.FileMode(mode & 07777)
+	}
+	rootKUID := creds.EffectiveKUID
+	uidStr, ok := mopts["uid"]
+	if ok {
+		delete(mopts, "uid")
+		uid, err := strconv.ParseUint(uidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
+		if !kuid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKUID = kuid
+	}
+	rootKGID := creds.EffectiveKGID
+	gidStr, ok := mopts["gid"]
+	if ok {
+		delete(mopts, "gid")
+		gid, err := strconv.ParseUint(gidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
+		if !kgid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKGID = kgid
+	}
+	if len(mopts) != 0 {
+		ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
 	devMinor, err := vfsObj.GetAnonBlockDevMinor()
 	if err != nil {
 		return nil, nil, err
@@ -127,11 +179,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	var root *dentry
 	switch rootFileType {
 	case linux.S_IFREG:
-		root = fs.newDentry(fs.newRegularFile(creds, 0777))
+		root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode))
 	case linux.S_IFLNK:
-		root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget))
+		root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget))
 	case linux.S_IFDIR:
-		root = &fs.newDirectory(creds, 01777).dentry
+		root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry
 	default:
 		fs.vfsfs.DecRef()
 		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
@@ -163,6 +215,11 @@ type dentry struct {
 	// filesystem.mu.
 	name string
 
+	// unlinked indicates whether this dentry has been unlinked from its parent.
+	// It is only set to true on an unlink operation, and never set from true to
+	// false. unlinked is protected by filesystem.mu.
+	unlinked bool
+
 	// dentryEntry (ugh) links dentries into their parent directory.childList.
 	dentryEntry
 
@@ -201,6 +258,26 @@ func (d *dentry) DecRef() {
 	d.inode.decRef()
 }
 
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {
+	if d.inode.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	// The ordering below is important, Linux always notifies the parent first.
+	if d.parent != nil {
+		// Note that d.parent or d.name may be stale if there is a concurrent
+		// rename operation. Inotify does not provide consistency guarantees.
+		d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.unlinked)
+	}
+	d.inode.watches.Notify("", events, cookie, et)
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	return &d.inode.watches
+}
+
 // inode represents a filesystem object.
 type inode struct {
 	// fs is the owning filesystem. fs is immutable.
@@ -209,11 +286,9 @@ type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory
 	// operations.
 	//
-	// A reference is held on all inodes that are reachable in the filesystem
-	// tree. For non-directories (which may have multiple hard links), this
-	// means that a reference is dropped when nlink reaches 0. For directories,
-	// nlink never reaches 0 due to the "." entry; instead,
-	// filesystem.RmdirAt() drops the reference.
+	// A reference is held on all inodes as long as they are reachable in the
+	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
+	// nlink reaches 0.
 	refs int64
 
 	// xattrs implements extended attributes.
@@ -235,23 +310,25 @@ type inode struct {
 	ctime int64 // nanoseconds
 	mtime int64 // nanoseconds
 
-	// Advisory file locks, which lock at the inode level.
 	locks lock.FileLocks
 
+	// Inotify watches for this inode.
+	watches vfs.Watches
+
 	impl interface{} // immutable
 }
 
 const maxLinks = math.MaxUint32
 
-func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) {
 	if mode.FileType() == 0 {
 		panic("file type is required in FileMode")
 	}
 	i.fs = fs
 	i.refs = 1
 	i.mode = uint32(mode)
-	i.uid = uint32(creds.EffectiveKUID)
-	i.gid = uint32(creds.EffectiveKGID)
+	i.uid = uint32(kuid)
+	i.gid = uint32(kgid)
 	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
 	// Tmpfs creation sets atime, ctime, and mtime to current time.
 	now := fs.clock.Now().Nanoseconds()
@@ -259,6 +336,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
 	i.ctime = now
 	i.mtime = now
 	// i.nlink initialized by caller
+	i.watches = vfs.Watches{}
 	i.impl = impl
 }
 
@@ -276,14 +354,17 @@ func (i *inode) incLinksLocked() {
 	atomic.AddUint32(&i.nlink, 1)
 }
 
-// decLinksLocked decrements i's link count.
+// decLinksLocked decrements i's link count. If the link count reaches 0, we
+// remove a reference on i as well.
 //
 // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
 func (i *inode) decLinksLocked() {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.decLinksLocked() called with no existing links")
 	}
-	atomic.AddUint32(&i.nlink, ^uint32(0))
+	if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 {
+		i.decRef()
+	}
 }
 
 func (i *inode) incRef() {
@@ -306,6 +387,7 @@ func (i *inode) tryIncRef() bool {
 
 func (i *inode) decRef() {
 	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+		i.watches.HandleDeletion()
 		if regFile, ok := i.impl.(*regularFile); ok {
 			// Release memory used by regFile to store data. Since regFile is
 			// no longer usable, we don't need to grab any locks or update any
@@ -455,44 +537,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu
 	return nil
 }
 
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		return i.locks.LockBSD(uid, t, block)
-	}
-	return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockBSD(uid fslock.UniqueID) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		i.locks.UnlockBSD(uid)
-		return nil
-	}
-	return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		return i.locks.LockPOSIX(uid, t, rng, block)
-	}
-	return syserror.EBADF
-}
-
-// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular.
-func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error {
-	switch i.impl.(type) {
-	case *regularFile:
-		i.locks.UnlockPOSIX(uid, rng)
-		return nil
-	}
-	return syserror.EBADF
-}
-
 // allocatedBlocksForSize returns the number of 512B blocks needed to
 // accommodate the given size in bytes, as appropriate for struct
 // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
@@ -531,6 +575,9 @@ func (i *inode) isDir() bool {
 }
 
 func (i *inode) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
+		return
+	}
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return
 	}
@@ -621,14 +668,19 @@ func (i *inode) userXattrSupported() bool {
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
 	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 }
 
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
 func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.Dentry().Impl().(*dentry).inode
+	return fd.dentry().inode
 }
 
 // Stat implements vfs.FileDescriptionImpl.Stat.
@@ -641,7 +693,15 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	creds := auth.CredentialsFromContext(ctx)
-	return fd.inode().setStat(ctx, creds, &opts.Stat)
+	d := fd.dentry()
+	if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil {
+		return err
+	}
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ev, 0, vfs.InodeEvent)
+	}
+	return nil
 }
 
 // Listxattr implements vfs.FileDescriptionImpl.Listxattr.
@@ -656,12 +716,26 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption
 
 // Setxattr implements vfs.FileDescriptionImpl.Setxattr.
 func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
-	return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts)
+	d := fd.dentry()
+	if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+		return err
+	}
+
+	// Generate inotify events.
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // Removexattr implements vfs.FileDescriptionImpl.Removexattr.
 func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
-	return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
+	d := fd.dentry()
+	if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+		return err
+	}
+
+	// Generate inotify events.
+	d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
 }
 
 // NewMemfd creates a new tmpfs regular file and file description that can back
@@ -674,8 +748,7 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s
 
 	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
 	// S_IRWXUGO.
-	mode := linux.FileMode(0777)
-	inode := fs.newRegularFile(creds, mode)
+	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
 	rf := inode.impl.(*regularFile)
 	if allowSeals {
 		rf.seals = 0
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go
index e057d2c6d..6862f2ef5 100644
--- a/pkg/sentry/kernel/auth/credentials.go
+++ b/pkg/sentry/kernel/auth/credentials.go
@@ -232,3 +232,31 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) {
 	}
 	return NoID, syserror.EPERM
 }
+
+// SetUID translates the provided uid to the root user namespace and updates c's
+// uids to it. This performs no permissions or capabilities checks, the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetUID(uid UID) error {
+	kuid := c.UserNamespace.MapToKUID(uid)
+	if !kuid.Ok() {
+		return syserror.EINVAL
+	}
+	c.RealKUID = kuid
+	c.EffectiveKUID = kuid
+	c.SavedKUID = kuid
+	return nil
+}
+
+// SetGID translates the provided gid to the root user namespace and updates c's
+// gids to it. This performs no permissions or capabilities checks, the caller
+// is responsible for ensuring the calling context is permitted to modify c.
+func (c *Credentials) SetGID(gid GID) error {
+	kgid := c.UserNamespace.MapToKGID(gid)
+	if !kgid.Ok() {
+		return syserror.EINVAL
+	}
+	c.RealKGID = kgid
+	c.EffectiveKGID = kgid
+	c.SavedKGID = kgid
+	return nil
+}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index ed40b5303..b35afafe3 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -80,9 +80,6 @@ type FDTable struct {
 	refs.AtomicRefCount
 	k *Kernel
 
-	// uid is a unique identifier.
-	uid uint64
-
 	// mu protects below.
 	mu sync.Mutex `state:"nosave"`
 
@@ -130,7 +127,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 // drop drops the table reference.
 func (f *FDTable) drop(file *fs.File) {
 	// Release locks.
-	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF})
+	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})
 
 	// Send inotify events.
 	d := file.Dirent
@@ -152,23 +149,21 @@ func (f *FDTable) drop(file *fs.File) {
 // dropVFS2 drops the table reference.
 func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
 	// TODO(gvisor.dev/issue/1480): Release locks.
-	// TODO(gvisor.dev/issue/1479): Send inotify events.
+
+	// Generate inotify events.
+	ev := uint32(linux.IN_CLOSE_NOWRITE)
+	if file.IsWritable() {
+		ev = linux.IN_CLOSE_WRITE
+	}
+	file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent)
 
 	// Drop the table reference.
 	file.DecRef()
 }
 
-// ID returns a unique identifier for this FDTable.
-func (f *FDTable) ID() uint64 {
-	return f.uid
-}
-
 // NewFDTable allocates a new FDTable that may be used by tasks in k.
 func (k *Kernel) NewFDTable() *FDTable {
-	f := &FDTable{
-		k:   k,
-		uid: atomic.AddUint64(&k.fdMapUids, 1),
-	}
+	f := &FDTable{k: k}
 	f.init()
 	return f
 }
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 5efeb3767..bcbeb6a39 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -194,11 +194,6 @@ type Kernel struct {
 	// cpuClockTickerSetting is protected by runningTasksMu.
 	cpuClockTickerSetting ktime.Setting
 
-	// fdMapUids is an ever-increasing counter for generating FDTable uids.
-	//
-	// fdMapUids is mutable, and is accessed using atomic memory operations.
-	fdMapUids uint64
-
 	// uniqueID is used to generate unique identifiers.
 	//
 	// uniqueID is mutable, and is accessed using atomic memory operations.
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index f29dc0472..0db546b98 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -8,6 +8,7 @@ go_library(
         "device.go",
         "node.go",
         "pipe.go",
+        "pipe_unsafe.go",
         "pipe_util.go",
         "reader.go",
         "reader_writer.go",
@@ -20,11 +21,13 @@ go_library(
         "//pkg/amutex",
         "//pkg/buffer",
         "//pkg/context",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 62c8691f1..79645d7d2 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -207,7 +207,10 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) {
 
 	p.mu.Lock()
 	defer p.mu.Unlock()
+	return p.readLocked(ctx, ops)
+}
 
+func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) {
 	// Is the pipe empty?
 	if p.view.Size() == 0 {
 		if !p.HasWriters() {
@@ -246,7 +249,10 @@ type writeOps struct {
 func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
+	return p.writeLocked(ctx, ops)
+}
 
+func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
 	// Can't write to a pipe with no readers.
 	if !p.HasReaders() {
 		return 0, syscall.EPIPE
diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go
index 847cb0784..dd60cba24 100644
--- a/pkg/sentry/fsimpl/gofer/pagemath.go
+++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go
@@ -12,20 +12,24 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package gofer
+package pipe
 
 import (
-	"gvisor.dev/gvisor/pkg/usermem"
+	"unsafe"
 )
 
-// This are equivalent to usermem.Addr.RoundDown/Up, but without the
-// potentially truncating conversion to usermem.Addr. This is necessary because
-// there is no way to define generic "PageRoundDown/Up" functions in Go.
-
-func pageRoundDown(x uint64) uint64 {
-	return x &^ (usermem.PageSize - 1)
-}
-
-func pageRoundUp(x uint64) uint64 {
-	return pageRoundDown(x + usermem.PageSize - 1)
+// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be
+// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that
+// concurrent calls cannot deadlock.
+//
+// Preconditions: x != y.
+func lockTwoPipes(x, y *Pipe) {
+	// Lock the two pipes in order of increasing address.
+	if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
+		x.mu.Lock()
+		y.mu.Lock()
+	} else {
+		y.mu.Lock()
+		x.mu.Lock()
+	}
 }
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 5a1d4fd57..aacf28da2 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -144,7 +144,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume
 		if v > math.MaxInt32 {
 			v = math.MaxInt32 // Silently truncate.
 		}
-		// Copy result to user-space.
+		// Copy result to userspace.
 		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
 			AddressSpaceActive: true,
 		})
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index b54f08a30..c0e9ee1f4 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -16,9 +16,12 @@ package pipe
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/buffer"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -59,11 +62,13 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
 //
 // Preconditions: statusFlags should not contain an open access mode.
 func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
-	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags)
+	// Connected pipes share the same locks.
+	locks := &lock.FileLocks{}
+	return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
 }
 
 // Open opens the pipe represented by vp.
-func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) {
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) (*vfs.FileDescription, error) {
 	vp.mu.Lock()
 	defer vp.mu.Unlock()
 
@@ -73,7 +78,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
 		return nil, syserror.EINVAL
 	}
 
-	fd := vp.newFD(mnt, vfsd, statusFlags)
+	fd := vp.newFD(mnt, vfsd, statusFlags, locks)
 
 	// Named pipes have special blocking semantics during open:
 	//
@@ -125,10 +130,11 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s
 }
 
 // Preconditions: vp.mu must be held.
-func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription {
+func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) *vfs.FileDescription {
 	fd := &VFSPipeFD{
 		pipe: &vp.pipe,
 	}
+	fd.LockFD.Init(locks)
 	fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
 		DenyPRead:         true,
 		DenyPWrite:        true,
@@ -150,11 +156,14 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *
 	return &fd.vfsfd
 }
 
-// VFSPipeFD implements vfs.FileDescriptionImpl for pipes.
+// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
+// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
+// other FileDescriptions for splice(2) and tee(2).
 type VFSPipeFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.LockFD
 
 	pipe *Pipe
 }
@@ -229,3 +238,216 @@ func (fd *VFSPipeFD) PipeSize() int64 {
 func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
 	return fd.pipe.SetFifoSize(size)
 }
+
+// IOSequence returns a useremm.IOSequence that reads up to count bytes from,
+// or writes up to count bytes to, fd.
+func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence {
+	return usermem.IOSequence{
+		IO:    fd,
+		Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}),
+	}
+}
+
+// CopyIn implements usermem.IO.CopyIn.
+func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
+	origCount := int64(len(dst))
+	n, err := fd.pipe.read(ctx, readOps{
+		left: func() int64 {
+			return int64(len(dst))
+		},
+		limit: func(l int64) {
+			dst = dst[:l]
+		},
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadAt(dst, 0)
+			view.TrimFront(int64(n))
+			return int64(n), err
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventOut)
+	}
+	if err == nil && n != origCount {
+		return int(n), syserror.ErrWouldBlock
+	}
+	return int(n), err
+}
+
+// CopyOut implements usermem.IO.CopyOut.
+func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) {
+	origCount := int64(len(src))
+	n, err := fd.pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return int64(len(src))
+		},
+		limit: func(l int64) {
+			src = src[:l]
+		},
+		write: func(view *buffer.View) (int64, error) {
+			view.Append(src)
+			return int64(len(src)), nil
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
+	}
+	if err == nil && n != origCount {
+		return int(n), syserror.ErrWouldBlock
+	}
+	return int(n), err
+}
+
+// ZeroOut implements usermem.IO.ZeroOut.
+func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
+	origCount := toZero
+	n, err := fd.pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return toZero
+		},
+		limit: func(l int64) {
+			toZero = l
+		},
+		write: func(view *buffer.View) (int64, error) {
+			view.Grow(view.Size()+toZero, true /* zero */)
+			return toZero, nil
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
+	}
+	if err == nil && n != origCount {
+		return n, syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// CopyInTo implements usermem.IO.CopyInTo.
+func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
+	count := ars.NumBytes()
+	if count == 0 {
+		return 0, nil
+	}
+	origCount := count
+	n, err := fd.pipe.read(ctx, readOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		read: func(view *buffer.View) (int64, error) {
+			n, err := view.ReadToSafememWriter(dst, uint64(count))
+			view.TrimFront(int64(n))
+			return int64(n), err
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventOut)
+	}
+	if err == nil && n != origCount {
+		return n, syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// CopyOutFrom implements usermem.IO.CopyOutFrom.
+func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
+	count := ars.NumBytes()
+	if count == 0 {
+		return 0, nil
+	}
+	origCount := count
+	n, err := fd.pipe.write(ctx, writeOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		write: func(view *buffer.View) (int64, error) {
+			n, err := view.WriteFromSafememReader(src, uint64(count))
+			return int64(n), err
+		},
+	})
+	if n > 0 {
+		fd.pipe.Notify(waiter.EventIn)
+	}
+	if err == nil && n != origCount {
+		return n, syserror.ErrWouldBlock
+	}
+	return n, err
+}
+
+// SwapUint32 implements usermem.IO.SwapUint32.
+func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
+	// How did a pipe get passed as the virtual address space to futex(2)?
+	panic("VFSPipeFD.SwapUint32 called unexpectedly")
+}
+
+// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
+func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
+	panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly")
+}
+
+// LoadUint32 implements usermem.IO.LoadUint32.
+func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+	panic("VFSPipeFD.LoadUint32 called unexpectedly")
+}
+
+// Splice reads up to count bytes from src and writes them to dst. It returns
+// the number of bytes moved.
+//
+// Preconditions: count > 0.
+func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+	return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */)
+}
+
+// Tee reads up to count bytes from src and writes them to dst, without
+// removing the read bytes from src. It returns the number of bytes copied.
+//
+// Preconditions: count > 0.
+func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
+	return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */)
+}
+
+// Preconditions: count > 0.
+func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) {
+	if dst.pipe == src.pipe {
+		return 0, syserror.EINVAL
+	}
+
+	lockTwoPipes(dst.pipe, src.pipe)
+	defer dst.pipe.mu.Unlock()
+	defer src.pipe.mu.Unlock()
+
+	n, err := dst.pipe.writeLocked(ctx, writeOps{
+		left: func() int64 {
+			return count
+		},
+		limit: func(l int64) {
+			count = l
+		},
+		write: func(dstView *buffer.View) (int64, error) {
+			return src.pipe.readLocked(ctx, readOps{
+				left: func() int64 {
+					return count
+				},
+				limit: func(l int64) {
+					count = l
+				},
+				read: func(srcView *buffer.View) (int64, error) {
+					n, err := srcView.ReadToSafememWriter(dstView, uint64(count))
+					if n > 0 && removeFromSrc {
+						srcView.TrimFront(int64(n))
+					}
+					return int64(n), err
+				},
+			})
+		},
+	})
+	if n > 0 {
+		dst.pipe.Notify(waiter.EventIn)
+		src.pipe.Notify(waiter.EventOut)
+	}
+	return n, err
+}
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 00c425cca..9b69f3cbe 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -198,6 +198,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
 	t.tg.pidns.owner.mu.Unlock()
 
+	oldFDTable := t.fdTable
+	t.fdTable = t.fdTable.Fork()
+	oldFDTable.DecRef()
+
 	// Remove FDs with the CloseOnExec flag set.
 	t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
 		return flags.CloseOnExec
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index c9db78e06..a5903b0b5 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -199,10 +199,10 @@ func (t *Task) doSyscall() taskRunState {
 	//
 	// On x86, register rax was shared by syscall number and return
 	// value, and at the entry of the syscall handler, the rax was
-	// saved to regs.orig_rax which was exposed to user space.
+	// saved to regs.orig_rax which was exposed to userspace.
 	// But on arm64, syscall number was passed through X8, and the X0
 	// was shared by the first syscall argument and return value. The
-	// X0 was saved to regs.orig_x0 which was not exposed to user space.
+	// X0 was saved to regs.orig_x0 which was not exposed to userspace.
 	// So we have to do the same operation here to save the X0 value
 	// into the task context.
 	t.Arch().SyscallSaveOrig()
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index da0ea7bb5..0adf25691 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -186,6 +186,7 @@ func (t *Timekeeper) startUpdater() {
 	timer := time.NewTicker(sentrytime.ApproxUpdateInterval)
 	t.wg.Add(1)
 	go func() { // S/R-SAFE: stopped during save.
+		defer t.wg.Done()
 		for {
 			// Start with an update immediately, so the clocks are
 			// ready ASAP.
@@ -220,7 +221,6 @@ func (t *Timekeeper) startUpdater() {
 			select {
 			case <-timer.C:
 			case <-t.stop:
-				t.wg.Done()
 				return
 			}
 		}
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 00977fc08..165869028 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -75,7 +75,7 @@ var _ fs.FileOperations = (*byteReader)(nil)
 
 // newByteReaderFile creates a fake file to read data from.
 //
-// TODO(gvisor.dev/issue/1623): Convert to VFS2.
+// TODO(gvisor.dev/issue/2921): Convert to VFS2.
 func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
 	// Create a fake inode.
 	inode := fs.NewInode(
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index 73591dab7..a036ce53c 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -25,6 +25,7 @@ go_template_instance(
     out = "vma_set.go",
     consts = {
         "minDegree": "8",
+        "trackGaps": "1",
     },
     imports = {
         "usermem": "gvisor.dev/gvisor/pkg/usermem",
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 9a14e69e6..16d8207e9 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -195,7 +195,7 @@ func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange {
 
 // Preconditions: mm.mappingMu must be locked.
 func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
-	for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextGap() {
+	for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(usermem.Addr(length)) {
 		if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
 			// Can we shift up to match the alignment?
 			if offset := uint64(gr.Start) % alignment; offset != 0 {
@@ -214,7 +214,7 @@ func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bou
 
 // Preconditions: mm.mappingMu must be locked.
 func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
-	for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevGap() {
+	for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(usermem.Addr(length)) {
 		if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
 			// Can we shift down to match the alignment?
 			start := gr.End - usermem.Addr(length)
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 1eeb9f317..a9836ba71 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -33,6 +33,7 @@ go_template_instance(
     out = "usage_set.go",
     consts = {
         "minDegree": "10",
+        "trackGaps": "1",
     },
     imports = {
         "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
@@ -48,6 +49,26 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "reclaim_set",
+    out = "reclaim_set.go",
+    consts = {
+        "minDegree": "10",
+    },
+    imports = {
+        "platform": "gvisor.dev/gvisor/pkg/sentry/platform",
+    },
+    package = "pgalloc",
+    prefix = "reclaim",
+    template = "//pkg/segment:generic_set",
+    types = {
+        "Key": "uint64",
+        "Range": "platform.FileRange",
+        "Value": "reclaimSetValue",
+        "Functions": "reclaimSetFunctions",
+    },
+)
+
 go_library(
     name = "pgalloc",
     srcs = [
@@ -56,6 +77,7 @@ go_library(
         "evictable_range_set.go",
         "pgalloc.go",
         "pgalloc_unsafe.go",
+        "reclaim_set.go",
         "save_restore.go",
         "usage_set.go",
     ],
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 2b11ea4ae..46f19d218 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -108,12 +108,6 @@ type MemoryFile struct {
 	usageSwapped  uint64
 	usageLast     time.Time
 
-	// minUnallocatedPage is the minimum page that may be unallocated.
-	// i.e., there are no unallocated pages below minUnallocatedPage.
-	//
-	// minUnallocatedPage is protected by mu.
-	minUnallocatedPage uint64
-
 	// fileSize is the size of the backing memory file in bytes. fileSize is
 	// always a power-of-two multiple of chunkSize.
 	//
@@ -146,11 +140,9 @@ type MemoryFile struct {
 	// is protected by mu.
 	reclaimable bool
 
-	// minReclaimablePage is the minimum page that may be reclaimable.
-	// i.e., all reclaimable pages are >= minReclaimablePage.
-	//
-	// minReclaimablePage is protected by mu.
-	minReclaimablePage uint64
+	// relcaim is the collection of regions for reclaim. relcaim is protected
+	// by mu.
+	reclaim reclaimSet
 
 	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
 	// transitions from false to true.
@@ -273,12 +265,10 @@ type evictableMemoryUserInfo struct {
 }
 
 const (
-	chunkShift = 24
-	chunkSize  = 1 << chunkShift // 16 MB
+	chunkShift = 30
+	chunkSize  = 1 << chunkShift // 1 GB
 	chunkMask  = chunkSize - 1
 
-	initialSize = chunkSize
-
 	// maxPage is the highest 64-bit page.
 	maxPage = math.MaxUint64 &^ (usermem.PageSize - 1)
 )
@@ -302,19 +292,12 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
 	if err := file.Truncate(0); err != nil {
 		return nil, err
 	}
-	if err := file.Truncate(initialSize); err != nil {
-		return nil, err
-	}
 	f := &MemoryFile{
-		opts:     opts,
-		fileSize: initialSize,
-		file:     file,
-		// No pages are reclaimable. DecRef will always be able to
-		// decrease minReclaimablePage from this point.
-		minReclaimablePage: maxPage,
-		evictable:          make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
+		opts:      opts,
+		file:      file,
+		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
 	}
-	f.mappings.Store(make([]uintptr, initialSize/chunkSize))
+	f.mappings.Store(make([]uintptr, 0))
 	f.reclaimCond.L = &f.mu
 
 	if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
@@ -404,39 +387,29 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
 		alignment = usermem.HugePageSize
 	}
 
-	start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment)
-	end := start + length
-	// File offsets are int64s. Since length must be strictly positive, end
-	// cannot legitimately be 0.
-	if end < start || int64(end) <= 0 {
+	// Find a range in the underlying file.
+	fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment)
+	if !ok {
 		return platform.FileRange{}, syserror.ENOMEM
 	}
 
-	// Expand the file if needed. Double the file size on each expansion;
-	// uncommitted pages have effectively no cost.
-	fileSize := f.fileSize
-	for int64(end) > fileSize {
-		if fileSize >= 2*fileSize {
-			// fileSize overflow.
-			return platform.FileRange{}, syserror.ENOMEM
-		}
-		fileSize *= 2
-	}
-	if fileSize > f.fileSize {
-		if err := f.file.Truncate(fileSize); err != nil {
+	// Expand the file if needed.
+	if int64(fr.End) > f.fileSize {
+		// Round the new file size up to be chunk-aligned.
+		newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
+		if err := f.file.Truncate(newFileSize); err != nil {
 			return platform.FileRange{}, err
 		}
-		f.fileSize = fileSize
+		f.fileSize = newFileSize
 		f.mappingsMu.Lock()
 		oldMappings := f.mappings.Load().([]uintptr)
-		newMappings := make([]uintptr, fileSize>>chunkShift)
+		newMappings := make([]uintptr, newFileSize>>chunkShift)
 		copy(newMappings, oldMappings)
 		f.mappings.Store(newMappings)
 		f.mappingsMu.Unlock()
 	}
 
 	// Mark selected pages as in use.
-	fr := platform.FileRange{start, end}
 	if f.opts.ManualZeroing {
 		if err := f.forEachMappingSlice(fr, func(bs []byte) {
 			for i := range bs {
@@ -453,49 +426,71 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi
 		panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
 	}
 
-	if minUnallocatedPage < start {
-		f.minUnallocatedPage = minUnallocatedPage
-	} else {
-		// start was the first unallocated page. The next must be
-		// somewhere beyond end.
-		f.minUnallocatedPage = end
-	}
-
 	return fr, nil
 }
 
-// findUnallocatedRange returns the first unallocated page in usage of the
-// specified length and alignment beginning at page start and the first single
-// unallocated page.
-func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) {
-	// Only searched until the first page is found.
-	firstPage := start
-	foundFirstPage := false
-	alignMask := alignment - 1
-	for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() {
-		r := seg.Range()
+// findAvailableRange returns an available range in the usageSet.
+//
+// Note that scanning for available slots takes place from end first backwards,
+// then forwards. This heuristic has important consequence for how sequential
+// mappings can be merged in the host VMAs, given that addresses for both
+// application and sentry mappings are allocated top-down (from higher to
+// lower addresses). The file is also grown expoentially in order to create
+// space for mappings to be allocated downwards.
+//
+// Precondition: alignment must be a power of 2.
+func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (platform.FileRange, bool) {
+	alignmentMask := alignment - 1
+	for gap := usage.UpperBoundGap(uint64(fileSize)); gap.Ok(); gap = gap.PrevLargeEnoughGap(length) {
+		// Start searching only at end of file.
+		end := gap.End()
+		if end > uint64(fileSize) {
+			end = uint64(fileSize)
+		}
 
-		if !foundFirstPage && r.Start > firstPage {
-			foundFirstPage = true
+		// Start at the top and align downwards.
+		start := end - length
+		if start > end {
+			break // Underflow.
 		}
+		start &^= alignmentMask
 
-		if start >= r.End {
-			// start was rounded up to an alignment boundary from the end
-			// of a previous segment and is now beyond r.End.
+		// Is the gap still sufficient?
+		if start < gap.Start() {
 			continue
 		}
-		// This segment represents allocated or reclaimable pages; only the
-		// range from start to the segment's beginning is allocatable, and the
-		// next allocatable range begins after the segment.
-		if r.Start > start && r.Start-start >= length {
-			break
+
+		// Allocate in the given gap.
+		return platform.FileRange{start, start + length}, true
+	}
+
+	// Check that it's possible to fit this allocation at the end of a file of any size.
+	min := usage.LastGap().Start()
+	min = (min + alignmentMask) &^ alignmentMask
+	if min+length < min {
+		// Overflow.
+		return platform.FileRange{}, false
+	}
+
+	// Determine the minimum file size required to fit this allocation at its end.
+	for {
+		if fileSize >= 2*fileSize {
+			// Is this because it's initially empty?
+			if fileSize == 0 {
+				fileSize += chunkSize
+			} else {
+				// fileSize overflow.
+				return platform.FileRange{}, false
+			}
+		} else {
+			// Double the current fileSize.
+			fileSize *= 2
 		}
-		start = (r.End + alignMask) &^ alignMask
-		if !foundFirstPage {
-			firstPage = r.End
+		start := (uint64(fileSize) - length) &^ alignmentMask
+		if start >= min {
+			return platform.FileRange{start, start + length}, true
 		}
 	}
-	return start, firstPage
 }
 
 // AllocateAndFill allocates memory of the given kind and fills it by calling
@@ -616,6 +611,7 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
 		}
 		val.refs--
 		if val.refs == 0 {
+			f.reclaim.Add(seg.Range(), reclaimSetValue{})
 			freed = true
 			// Reclassify memory as System, until it's freed by the reclaim
 			// goroutine.
@@ -628,10 +624,6 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) {
 	f.usage.MergeAdjacent(fr)
 
 	if freed {
-		if fr.Start < f.minReclaimablePage {
-			// We've freed at least one lower page.
-			f.minReclaimablePage = fr.Start
-		}
 		f.reclaimable = true
 		f.reclaimCond.Signal()
 	}
@@ -1030,6 +1022,7 @@ func (f *MemoryFile) String() string {
 // for allocation.
 func (f *MemoryFile) runReclaim() {
 	for {
+		// N.B. We must call f.markReclaimed on the returned FrameRange.
 		fr, ok := f.findReclaimable()
 		if !ok {
 			break
@@ -1085,6 +1078,10 @@ func (f *MemoryFile) runReclaim() {
 	}
 }
 
+// findReclaimable finds memory that has been marked for reclaim.
+//
+// Note that there returned range will be removed from tracking. It
+// must be reclaimed (removed from f.usage) at this point.
 func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -1103,18 +1100,15 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
 			}
 			f.reclaimCond.Wait()
 		}
-		// Allocate returns the first usable range in offset order and is
-		// currently a linear scan, so reclaiming from the beginning of the
-		// file minimizes the expected latency of Allocate.
-		for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() {
-			if seg.ValuePtr().refs == 0 {
-				f.minReclaimablePage = seg.End()
-				return seg.Range(), true
-			}
+		// Allocate works from the back of the file inwards, so reclaim
+		// preserves this order to minimize the cost of the search.
+		if seg := f.reclaim.LastSegment(); seg.Ok() {
+			fr := seg.Range()
+			f.reclaim.Remove(seg)
+			return fr, true
 		}
-		// No pages are reclaimable.
+		// Nothing is reclaimable.
 		f.reclaimable = false
-		f.minReclaimablePage = maxPage
 	}
 }
 
@@ -1122,8 +1116,8 @@ func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 	seg := f.usage.FindSegment(fr.Start)
-	// All of fr should be mapped to a single uncommitted reclaimable segment
-	// accounted to System.
+	// All of fr should be mapped to a single uncommitted reclaimable
+	// segment accounted to System.
 	if !seg.Ok() {
 		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
 	}
@@ -1137,14 +1131,10 @@ func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
 	}); got != want {
 		panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
 	}
-	// Deallocate reclaimed pages. Even though all of seg is reclaimable, the
-	// caller of markReclaimed may not have decommitted it, so we can only mark
-	// fr as reclaimed.
+	// Deallocate reclaimed pages. Even though all of seg is reclaimable,
+	// the caller of markReclaimed may not have decommitted it, so we can
+	// only mark fr as reclaimed.
 	f.usage.Remove(f.usage.Isolate(seg, fr))
-	if fr.Start < f.minUnallocatedPage {
-		// We've deallocated at least one lower page.
-		f.minUnallocatedPage = fr.Start
-	}
 }
 
 // StartEvictions requests that f evict all evictable allocations. It does not
@@ -1255,3 +1245,27 @@ func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetVal
 func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
 	return evictableRangeSetValue{}, evictableRangeSetValue{}
 }
+
+// reclaimSetValue is the value type of reclaimSet.
+type reclaimSetValue struct{}
+
+type reclaimSetFunctions struct{}
+
+func (reclaimSetFunctions) MinKey() uint64 {
+	return 0
+}
+
+func (reclaimSetFunctions) MaxKey() uint64 {
+	return math.MaxUint64
+}
+
+func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) {
+}
+
+func (reclaimSetFunctions) Merge(_ platform.FileRange, _ reclaimSetValue, _ platform.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) {
+	return reclaimSetValue{}, true
+}
+
+func (reclaimSetFunctions) Split(_ platform.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) {
+	return reclaimSetValue{}, reclaimSetValue{}
+}
diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go
index 293f22c6b..b5b68eb52 100644
--- a/pkg/sentry/pgalloc/pgalloc_test.go
+++ b/pkg/sentry/pgalloc/pgalloc_test.go
@@ -23,39 +23,49 @@ import (
 const (
 	page     = usermem.PageSize
 	hugepage = usermem.HugePageSize
+	topPage  = (1 << 63) - page
 )
 
 func TestFindUnallocatedRange(t *testing.T) {
 	for _, test := range []struct {
-		desc           string
-		usage          *usageSegmentDataSlices
-		start          uint64
-		length         uint64
-		alignment      uint64
-		unallocated    uint64
-		minUnallocated uint64
+		desc       string
+		usage      *usageSegmentDataSlices
+		fileSize   int64
+		length     uint64
+		alignment  uint64
+		start      uint64
+		expectFail bool
 	}{
 		{
-			desc:           "Initial allocation succeeds",
-			usage:          &usageSegmentDataSlices{},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    0,
-			minUnallocated: 0,
+			desc:      "Initial allocation succeeds",
+			usage:     &usageSegmentDataSlices{},
+			length:    page,
+			alignment: page,
+			start:     chunkSize - page, // Grows by chunkSize, allocate down.
 		},
 		{
-			desc: "Allocation begins at start of file",
+			desc: "Allocation finds empty space at start of file",
 			usage: &usageSegmentDataSlices{
 				Start:  []uint64{page},
 				End:    []uint64{2 * page},
 				Values: []usageInfo{{refs: 1}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    0,
-			minUnallocated: 0,
+			fileSize:  2 * page,
+			length:    page,
+			alignment: page,
+			start:     0,
+		},
+		{
+			desc: "Allocation finds empty space at end of file",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{0},
+				End:    []uint64{page},
+				Values: []usageInfo{{refs: 1}},
+			},
+			fileSize:  2 * page,
+			length:    page,
+			alignment: page,
+			start:     page,
 		},
 		{
 			desc: "In-use frames are not allocatable",
@@ -64,11 +74,10 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 2 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 2}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    2 * page,
-			minUnallocated: 2 * page,
+			fileSize:  2 * page,
+			length:    page,
+			alignment: page,
+			start:     3 * page, // Double fileSize, allocate top-down.
 		},
 		{
 			desc: "Reclaimable frames are not allocatable",
@@ -77,11 +86,10 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 2 * page, 3 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    3 * page,
-			minUnallocated: 3 * page,
+			fileSize:  3 * page,
+			length:    page,
+			alignment: page,
+			start:     5 * page, // Double fileSize, grow down.
 		},
 		{
 			desc: "Gaps between in-use frames are allocatable",
@@ -90,11 +98,10 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 3 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 1}},
 			},
-			start:          0,
-			length:         page,
-			alignment:      page,
-			unallocated:    page,
-			minUnallocated: page,
+			fileSize:  3 * page,
+			length:    page,
+			alignment: page,
+			start:     page,
 		},
 		{
 			desc: "Inadequately-sized gaps are rejected",
@@ -103,14 +110,13 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, 3 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 1}},
 			},
-			start:          0,
-			length:         2 * page,
-			alignment:      page,
-			unallocated:    3 * page,
-			minUnallocated: page,
+			fileSize:  3 * page,
+			length:    2 * page,
+			alignment: page,
+			start:     4 * page, // Double fileSize, grow down.
 		},
 		{
-			desc: "Hugepage alignment is honored",
+			desc: "Alignment is honored at end of file",
 			usage: &usageSegmentDataSlices{
 				Start: []uint64{0, hugepage + page},
 				// Hugepage-sized gap here that shouldn't be allocated from
@@ -118,37 +124,95 @@ func TestFindUnallocatedRange(t *testing.T) {
 				End:    []uint64{page, hugepage + 2*page},
 				Values: []usageInfo{{refs: 1}, {refs: 1}},
 			},
-			start:          0,
-			length:         hugepage,
-			alignment:      hugepage,
-			unallocated:    2 * hugepage,
-			minUnallocated: page,
+			fileSize:  hugepage + 2*page,
+			length:    hugepage,
+			alignment: hugepage,
+			start:     3 * hugepage, // Double fileSize until alignment is satisfied, grow down.
+		},
+		{
+			desc: "Alignment is honored before end of file",
+			usage: &usageSegmentDataSlices{
+				Start: []uint64{0, 2*hugepage + page},
+				// Page will need to be shifted down from top.
+				End:    []uint64{page, 2*hugepage + 2*page},
+				Values: []usageInfo{{refs: 1}, {refs: 1}},
+			},
+			fileSize:  2*hugepage + 2*page,
+			length:    hugepage,
+			alignment: hugepage,
+			start:     hugepage,
 		},
 		{
-			desc: "Pages before start ignored",
+			desc: "Allocations are compact if possible",
 			usage: &usageSegmentDataSlices{
 				Start:  []uint64{page, 3 * page},
 				End:    []uint64{2 * page, 4 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 2}},
 			},
-			start:          page,
-			length:         page,
-			alignment:      page,
-			unallocated:    2 * page,
-			minUnallocated: 2 * page,
+			fileSize:  4 * page,
+			length:    page,
+			alignment: page,
+			start:     2 * page,
+		},
+		{
+			desc: "Top-down allocation within one gap",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page, 4 * page, 7 * page},
+				End:    []uint64{2 * page, 5 * page, 8 * page},
+				Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+			},
+			fileSize:  8 * page,
+			length:    page,
+			alignment: page,
+			start:     6 * page,
+		},
+		{
+			desc: "Top-down allocation between multiple gaps",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page, 3 * page, 5 * page},
+				End:    []uint64{2 * page, 4 * page, 6 * page},
+				Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}},
+			},
+			fileSize:  6 * page,
+			length:    page,
+			alignment: page,
+			start:     4 * page,
 		},
 		{
-			desc: "start may be in the middle of segment",
+			desc: "Top-down allocation with large top gap",
 			usage: &usageSegmentDataSlices{
-				Start:  []uint64{0, 3 * page},
+				Start:  []uint64{page, 3 * page},
 				End:    []uint64{2 * page, 4 * page},
 				Values: []usageInfo{{refs: 1}, {refs: 2}},
 			},
-			start:          page,
-			length:         page,
-			alignment:      page,
-			unallocated:    2 * page,
-			minUnallocated: 2 * page,
+			fileSize:  8 * page,
+			length:    page,
+			alignment: page,
+			start:     7 * page,
+		},
+		{
+			desc: "Gaps found with possible overflow",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page, topPage - page},
+				End:    []uint64{2 * page, topPage},
+				Values: []usageInfo{{refs: 1}, {refs: 1}},
+			},
+			fileSize:  topPage,
+			length:    page,
+			alignment: page,
+			start:     topPage - 2*page,
+		},
+		{
+			desc: "Overflow detected",
+			usage: &usageSegmentDataSlices{
+				Start:  []uint64{page},
+				End:    []uint64{topPage},
+				Values: []usageInfo{{refs: 1}},
+			},
+			fileSize:   topPage,
+			length:     2 * page,
+			alignment:  page,
+			expectFail: true,
 		},
 	} {
 		t.Run(test.desc, func(t *testing.T) {
@@ -156,12 +220,18 @@ func TestFindUnallocatedRange(t *testing.T) {
 			if err := usage.ImportSortedSlices(test.usage); err != nil {
 				t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err)
 			}
-			unallocated, minUnallocated := findUnallocatedRange(&usage, test.start, test.length, test.alignment)
-			if unallocated != test.unallocated {
-				t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got unallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, unallocated, test.unallocated)
+			fr, ok := findAvailableRange(&usage, test.fileSize, test.length, test.alignment)
+			if !test.expectFail && !ok {
+				t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, false wanted %x, true", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+			}
+			if test.expectFail && ok {
+				t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, true wanted %x, false", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
+			}
+			if ok && fr.Start != test.start {
+				t.Errorf("findAvailableRange(%v, %x, %x, %x): got start=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start)
 			}
-			if minUnallocated != test.minUnallocated {
-				t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got minUnallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, minUnallocated, test.minUnallocated)
+			if ok && fr.End != test.start+test.length {
+				t.Errorf("findAvailableRange(%v, %x, %x, %x): got end=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.End, test.start+test.length)
 			}
 		})
 	}
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 159f7eafd..4792454c4 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -6,8 +6,8 @@ go_library(
     name = "kvm",
     srcs = [
         "address_space.go",
-        "allocator.go",
         "bluepill.go",
+        "bluepill_allocator.go",
         "bluepill_amd64.go",
         "bluepill_amd64.s",
         "bluepill_amd64_unsafe.go",
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index be213bfe8..faf1d5e1c 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -26,16 +26,15 @@ import (
 
 // dirtySet tracks vCPUs for invalidation.
 type dirtySet struct {
-	vCPUs []uint64
+	vCPUMasks []uint64
 }
 
 // forEach iterates over all CPUs in the dirty set.
+//
+//go:nosplit
 func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
-	m.mu.RLock()
-	defer m.mu.RUnlock()
-
-	for index := range ds.vCPUs {
-		mask := atomic.SwapUint64(&ds.vCPUs[index], 0)
+	for index := range ds.vCPUMasks {
+		mask := atomic.SwapUint64(&ds.vCPUMasks[index], 0)
 		if mask != 0 {
 			for bit := 0; bit < 64; bit++ {
 				if mask&(1<<uint64(bit)) == 0 {
@@ -54,7 +53,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
 	index := uint64(c.id) / 64
 	bit := uint64(1) << uint(c.id%64)
 
-	oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+	oldValue := atomic.LoadUint64(&ds.vCPUMasks[index])
 	if oldValue&bit != 0 {
 		return false // Not clean.
 	}
@@ -62,7 +61,7 @@ func (ds *dirtySet) mark(c *vCPU) bool {
 	// Set the bit unilaterally, and ensure that a flush takes place. Note
 	// that it's possible for races to occur here, but since the flush is
 	// taking place long after these lines there's no race in practice.
-	atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+	atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
 	return true // Previously clean.
 }
 
@@ -113,7 +112,12 @@ type hostMapEntry struct {
 	length uintptr
 }
 
-func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
+// mapLocked maps the given host entry.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) mapLocked(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
 	for m.length > 0 {
 		physical, length, ok := translateToPhysical(m.addr)
 		if !ok {
@@ -133,18 +137,10 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac
 		// important; if the pagetable mappings were installed before
 		// ensuring the physical pages were available, then some other
 		// thread could theoretically access them.
-		//
-		// Due to the way KVM's shadow paging implementation works,
-		// modifications to the page tables while in host mode may not
-		// be trapped, leading to the shadow pages being out of sync.
-		// Therefore, we need to ensure that we are in guest mode for
-		// page table modifications. See the call to bluepill, below.
-		as.machine.retryInGuest(func() {
-			inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
-				AccessType: at,
-				User:       true,
-			}, physical) || inv
-		})
+		inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
+			AccessType: at,
+			User:       true,
+		}, physical) || inv
 		m.addr += length
 		m.length -= length
 		addr += usermem.Addr(length)
@@ -176,6 +172,10 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
 		return err
 	}
 
+	// See block in mapLocked.
+	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+
 	// Map the mappings in the sentry's address space (guest physical memory)
 	// into the application's address space (guest virtual memory).
 	inv := false
@@ -190,7 +190,12 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
 				_ = s[i] // Touch to commit.
 			}
 		}
-		prev := as.mapHost(addr, hostMapEntry{
+
+		// See bluepill_allocator.go.
+		bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+		// Perform the mapping.
+		prev := as.mapLocked(addr, hostMapEntry{
 			addr:   b.Addr(),
 			length: uintptr(b.Len()),
 		}, at)
@@ -204,17 +209,27 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.
 	return nil
 }
 
+// unmapLocked is an escape-checked wrapped around Unmap.
+//
+// +checkescape:hard,stack
+//
+//go:nosplit
+func (as *addressSpace) unmapLocked(addr usermem.Addr, length uint64) bool {
+	return as.pageTables.Unmap(addr, uintptr(length))
+}
+
 // Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
 func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) {
 	as.mu.Lock()
 	defer as.mu.Unlock()
 
-	// See above re: retryInGuest.
-	var prev bool
-	as.machine.retryInGuest(func() {
-		prev = as.pageTables.Unmap(addr, uintptr(length)) || prev
-	})
-	if prev {
+	// See above & bluepill_allocator.go.
+	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+	bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+	if prev := as.unmapLocked(addr, length); prev {
+		// Invalidate all active vCPUs.
 		as.invalidate()
 
 		// Recycle any freed intermediate pages.
@@ -227,7 +242,7 @@ func (as *addressSpace) Release() {
 	as.Unmap(0, ^uint64(0))
 
 	// Free all pages from the allocator.
-	as.pageTables.Allocator.(allocator).base.Drain()
+	as.pageTables.Allocator.(*allocator).base.Drain()
 
 	// Drop all cached machine references.
 	as.machine.dropPageTables(as.pageTables)
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/bluepill_allocator.go
index 3f35414bb..9485e1301 100644
--- a/pkg/sentry/platform/kvm/allocator.go
+++ b/pkg/sentry/platform/kvm/bluepill_allocator.go
@@ -21,56 +21,80 @@ import (
 )
 
 type allocator struct {
-	base *pagetables.RuntimeAllocator
+	base pagetables.RuntimeAllocator
+
+	// cpu must be set prior to any pagetable operation.
+	//
+	// Due to the way KVM's shadow paging implementation works,
+	// modifications to the page tables while in host mode may not be
+	// trapped, leading to the shadow pages being out of sync.  Therefore,
+	// we need to ensure that we are in guest mode for page table
+	// modifications. See the call to bluepill, below.
+	cpu *vCPU
 }
 
 // newAllocator is used to define the allocator.
-func newAllocator() allocator {
-	return allocator{
-		base: pagetables.NewRuntimeAllocator(),
-	}
+func newAllocator() *allocator {
+	a := new(allocator)
+	a.base.Init()
+	return a
 }
 
 // NewPTEs implements pagetables.Allocator.NewPTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) NewPTEs() *pagetables.PTEs {
-	return a.base.NewPTEs()
+func (a *allocator) NewPTEs() *pagetables.PTEs {
+	ptes := a.base.NewPTEs() // escapes: bluepill below.
+	if a.cpu != nil {
+		bluepill(a.cpu)
+	}
+	return ptes
 }
 
 // PhysicalFor returns the physical address for a set of PTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
+func (a *allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
 	virtual := a.base.PhysicalFor(ptes)
 	physical, _, ok := translateToPhysical(virtual)
 	if !ok {
-		panic(fmt.Sprintf("PhysicalFor failed for %p", ptes))
+		panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) // escapes: panic.
 	}
 	return physical
 }
 
 // LookupPTEs implements pagetables.Allocator.LookupPTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
+func (a *allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
 	virtualStart, physicalStart, _, ok := calculateBluepillFault(physical, physicalRegions)
 	if !ok {
-		panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical))
+		panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) // escapes: panic.
 	}
 	return a.base.LookupPTEs(virtualStart + (physical - physicalStart))
 }
 
 // FreePTEs implements pagetables.Allocator.FreePTEs.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (a allocator) FreePTEs(ptes *pagetables.PTEs) {
-	a.base.FreePTEs(ptes)
+func (a *allocator) FreePTEs(ptes *pagetables.PTEs) {
+	a.base.FreePTEs(ptes) // escapes: bluepill below.
+	if a.cpu != nil {
+		bluepill(a.cpu)
+	}
 }
 
 // Recycle implements pagetables.Allocator.Recycle.
 //
 //go:nosplit
-func (a allocator) Recycle() {
+func (a *allocator) Recycle() {
 	a.base.Recycle()
 }
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index 133c2203d..ddc1554d5 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -63,6 +63,8 @@ func bluepillArchEnter(context *arch.SignalContext64) *vCPU {
 
 // KernelSyscall handles kernel syscalls.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelSyscall() {
 	regs := c.Registers()
@@ -72,13 +74,15 @@ func (c *vCPU) KernelSyscall() {
 	// We only trigger a bluepill entry in the bluepill function, and can
 	// therefore be guaranteed that there is no floating point state to be
 	// loaded on resuming from halt. We only worry about saving on exit.
-	ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+	ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
 	ring0.Halt()
-	ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+	ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment.
 }
 
 // KernelException handles kernel exceptions.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelException(vector ring0.Vector) {
 	regs := c.Registers()
@@ -89,9 +93,9 @@ func (c *vCPU) KernelException(vector ring0.Vector) {
 		regs.Rip = 0
 	}
 	// See above.
-	ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+	ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
 	ring0.Halt()
-	ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+	ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment.
 }
 
 // bluepillArchExit is called during bluepillEnter.
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index 99cac665d..0d1e83e6c 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -53,3 +53,10 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
 	context.Rbx = uint64(uintptr(unsafe.Pointer(c)))
 	context.Rip = uint64(dieTrampolineAddr)
 }
+
+// getHypercallID returns hypercall ID.
+//
+//go:nosplit
+func getHypercallID(addr uintptr) int {
+	return _KVM_HYPERCALL_MAX
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index c215d443c..83643c602 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -66,6 +66,8 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
 
 // KernelSyscall handles kernel syscalls.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelSyscall() {
 	regs := c.Registers()
@@ -88,6 +90,8 @@ func (c *vCPU) KernelSyscall() {
 
 // KernelException handles kernel exceptions.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *vCPU) KernelException(vector ring0.Vector) {
 	regs := c.Registers()
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 4ca2b7717..abd36f973 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -61,3 +61,16 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
 func bluepillArchFpContext(context unsafe.Pointer) *arch.FpsimdContext {
 	return &((*arch.SignalContext64)(context).Fpsimd64)
 }
+
+// getHypercallID returns hypercall ID.
+//
+// On Arm64, the MMIO address should be 64-bit aligned.
+//
+//go:nosplit
+func getHypercallID(addr uintptr) int {
+	if addr < arm64HypercallMMIOBase || addr >= (arm64HypercallMMIOBase+_AARCH64_HYPERCALL_MMIO_SIZE) {
+		return _KVM_HYPERCALL_MAX
+	} else {
+		return int(((addr) - arm64HypercallMMIOBase) >> 3)
+	}
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 9add7c944..a5b9be36d 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -58,12 +58,32 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
 	return &((*arch.UContext64)(context).MContext)
 }
 
+// bluepillHandleHlt is reponsible for handling VM-Exit.
+//
+//go:nosplit
+func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
+	// Copy out registers.
+	bluepillArchExit(c, bluepillArchContext(context))
+
+	// Return to the vCPUReady state; notify any waiters.
+	user := atomic.LoadUint32(&c.state) & vCPUUser
+	switch atomic.SwapUint32(&c.state, user) {
+	case user | vCPUGuest: // Expected case.
+	case user | vCPUGuest | vCPUWaiter:
+		c.notify()
+	default:
+		throw("invalid state")
+	}
+}
+
 // bluepillHandler is called from the signal stub.
 //
 // The world may be stopped while this is executing, and it executes on the
 // signal stack. It should only execute raw system calls and functions that are
 // explicitly marked go:nosplit.
 //
+// +checkescape:all
+//
 //go:nosplit
 func bluepillHandler(context unsafe.Pointer) {
 	// Sanitize the registers; interrupts must always be disabled.
@@ -82,7 +102,8 @@ func bluepillHandler(context unsafe.Pointer) {
 	}
 
 	for {
-		switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno {
+		_, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
+		switch errno {
 		case 0: // Expected case.
 		case syscall.EINTR:
 			// First, we process whatever pending signal
@@ -90,7 +111,7 @@ func bluepillHandler(context unsafe.Pointer) {
 			// currently, all signals are masked and the signal
 			// must have been delivered directly to this thread.
 			timeout := syscall.Timespec{}
-			sig, _, errno := syscall.RawSyscall6(
+			sig, _, errno := syscall.RawSyscall6( // escapes: no.
 				syscall.SYS_RT_SIGTIMEDWAIT,
 				uintptr(unsafe.Pointer(&bounceSignalMask)),
 				0,                                 // siginfo.
@@ -125,7 +146,7 @@ func bluepillHandler(context unsafe.Pointer) {
 			// MMIO exit we receive EFAULT from the run ioctl. We
 			// always inject an NMI here since we may be in kernel
 			// mode and have interrupts disabled.
-			if _, _, errno := syscall.RawSyscall(
+			if _, _, errno := syscall.RawSyscall( // escapes: no.
 				syscall.SYS_IOCTL,
 				uintptr(c.fd),
 				_KVM_NMI, 0); errno != 0 {
@@ -156,25 +177,20 @@ func bluepillHandler(context unsafe.Pointer) {
 			c.die(bluepillArchContext(context), "debug")
 			return
 		case _KVM_EXIT_HLT:
-			// Copy out registers.
-			bluepillArchExit(c, bluepillArchContext(context))
-
-			// Return to the vCPUReady state; notify any waiters.
-			user := atomic.LoadUint32(&c.state) & vCPUUser
-			switch atomic.SwapUint32(&c.state, user) {
-			case user | vCPUGuest: // Expected case.
-			case user | vCPUGuest | vCPUWaiter:
-				c.notify()
-			default:
-				throw("invalid state")
-			}
+			bluepillGuestExit(c, context)
 			return
 		case _KVM_EXIT_MMIO:
+			physical := uintptr(c.runData.data[0])
+			if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT {
+				bluepillGuestExit(c, context)
+				return
+			}
+
 			// Increment the fault count.
 			atomic.AddUint32(&c.faults, 1)
 
 			// For MMIO, the physical address is the first data item.
-			physical := uintptr(c.runData.data[0])
+			physical = uintptr(c.runData.data[0])
 			virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE)
 			if !ok {
 				c.die(bluepillArchContext(context), "invalid physical address")
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
index 29d457a7e..3134a076b 100644
--- a/pkg/sentry/platform/kvm/kvm_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -26,6 +26,9 @@ type kvmOneReg struct {
 	addr uint64
 }
 
+// arm64HypercallMMIOBase is MMIO base address used to dispatch hypercalls.
+var arm64HypercallMMIOBase uintptr
+
 const KVM_NR_SPSR = 5
 
 type userFpsimdState struct {
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 1d5c77ff4..6a7676468 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -71,3 +71,13 @@ const (
 	_KVM_MEM_READONLY        = uint32(1) << 1
 	_KVM_MEM_FLAGS_NONE      = 0
 )
+
+// KVM hypercall list.
+// Canonical list of hypercalls supported.
+const (
+	// On amd64, it uses 'HLT' to leave the guest.
+	// Unlike amd64, arm64 can only uses mmio_exit/psci to leave the guest.
+	// _KVM_HYPERCALL_VMEXIT is only used on Arm64 for now.
+	_KVM_HYPERCALL_VMEXIT int = iota
+	_KVM_HYPERCALL_MAX
+)
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
index 531ae8b1e..6f0539c29 100644
--- a/pkg/sentry/platform/kvm/kvm_const_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -131,3 +131,10 @@ const (
 	_ESR_SEGV_PEMERR_L2 = 0xe
 	_ESR_SEGV_PEMERR_L3 = 0xf
 )
+
+// Arm64: MMIO base address used to dispatch hypercalls.
+const (
+	// on Arm64, the MMIO address must be 64-bit aligned.
+	// Currently, we only need 1 hypercall: hypercall_vmexit.
+	_AARCH64_HYPERCALL_MMIO_SIZE = 1 << 3
+)
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f1afc74dc..6c54712d1 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -52,16 +52,19 @@ type machine struct {
 	// available is notified when vCPUs are available.
 	available sync.Cond
 
-	// vCPUs are the machine vCPUs.
+	// vCPUsByTID are the machine vCPUs.
 	//
 	// These are populated dynamically.
-	vCPUs map[uint64]*vCPU
+	vCPUsByTID map[uint64]*vCPU
 
 	// vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
-	vCPUsByID map[int]*vCPU
+	vCPUsByID []*vCPU
 
 	// maxVCPUs is the maximum number of vCPUs supported by the machine.
 	maxVCPUs int
+
+	// nextID is the next vCPU ID.
+	nextID uint32
 }
 
 const (
@@ -137,9 +140,8 @@ type dieState struct {
 //
 // Precondition: mu must be held.
 func (m *machine) newVCPU() *vCPU {
-	id := len(m.vCPUs)
-
 	// Create the vCPU.
+	id := int(atomic.AddUint32(&m.nextID, 1) - 1)
 	fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
 	if errno != 0 {
 		panic(fmt.Sprintf("error creating new vCPU: %v", errno))
@@ -176,11 +178,7 @@ func (m *machine) newVCPU() *vCPU {
 // newMachine returns a new VM context.
 func newMachine(vm int) (*machine, error) {
 	// Create the machine.
-	m := &machine{
-		fd:        vm,
-		vCPUs:     make(map[uint64]*vCPU),
-		vCPUsByID: make(map[int]*vCPU),
-	}
+	m := &machine{fd: vm}
 	m.available.L = &m.mu
 	m.kernel.Init(ring0.KernelOpts{
 		PageTables: pagetables.New(newAllocator()),
@@ -194,6 +192,10 @@ func newMachine(vm int) (*machine, error) {
 	}
 	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
 
+	// Create the vCPUs map/slices.
+	m.vCPUsByTID = make(map[uint64]*vCPU)
+	m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
+
 	// Apply the physical mappings. Note that these mappings may point to
 	// guest physical addresses that are not actually available. These
 	// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -274,6 +276,8 @@ func newMachine(vm int) (*machine, error) {
 // not available. This attempts to be efficient for calls in the hot path.
 //
 // This panics on error.
+//
+//go:nosplit
 func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) {
 	for end := physical + length; physical < end; {
 		_, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions)
@@ -304,7 +308,11 @@ func (m *machine) Destroy() {
 	runtime.SetFinalizer(m, nil)
 
 	// Destroy vCPUs.
-	for _, c := range m.vCPUs {
+	for _, c := range m.vCPUsByID {
+		if c == nil {
+			continue
+		}
+
 		// Ensure the vCPU is not still running in guest mode. This is
 		// possible iff teardown has been done by other threads, and
 		// somehow a single thread has not executed any system calls.
@@ -337,7 +345,7 @@ func (m *machine) Get() *vCPU {
 	tid := procid.Current()
 
 	// Check for an exact match.
-	if c := m.vCPUs[tid]; c != nil {
+	if c := m.vCPUsByTID[tid]; c != nil {
 		c.lock()
 		m.mu.RUnlock()
 		return c
@@ -356,7 +364,7 @@ func (m *machine) Get() *vCPU {
 	tid = procid.Current()
 
 	// Recheck for an exact match.
-	if c := m.vCPUs[tid]; c != nil {
+	if c := m.vCPUsByTID[tid]; c != nil {
 		c.lock()
 		m.mu.Unlock()
 		return c
@@ -364,10 +372,10 @@ func (m *machine) Get() *vCPU {
 
 	for {
 		// Scan for an available vCPU.
-		for origTID, c := range m.vCPUs {
+		for origTID, c := range m.vCPUsByTID {
 			if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
-				delete(m.vCPUs, origTID)
-				m.vCPUs[tid] = c
+				delete(m.vCPUsByTID, origTID)
+				m.vCPUsByTID[tid] = c
 				m.mu.Unlock()
 				c.loadSegments(tid)
 				return c
@@ -375,17 +383,17 @@ func (m *machine) Get() *vCPU {
 		}
 
 		// Create a new vCPU (maybe).
-		if len(m.vCPUs) < m.maxVCPUs {
+		if int(m.nextID) < m.maxVCPUs {
 			c := m.newVCPU()
 			c.lock()
-			m.vCPUs[tid] = c
+			m.vCPUsByTID[tid] = c
 			m.mu.Unlock()
 			c.loadSegments(tid)
 			return c
 		}
 
 		// Scan for something not in user mode.
-		for origTID, c := range m.vCPUs {
+		for origTID, c := range m.vCPUsByTID {
 			if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
 				continue
 			}
@@ -403,8 +411,8 @@ func (m *machine) Get() *vCPU {
 			}
 
 			// Steal the vCPU.
-			delete(m.vCPUs, origTID)
-			m.vCPUs[tid] = c
+			delete(m.vCPUsByTID, origTID)
+			m.vCPUsByTID[tid] = c
 			m.mu.Unlock()
 			c.loadSegments(tid)
 			return c
@@ -431,7 +439,7 @@ func (m *machine) Put(c *vCPU) {
 // newDirtySet returns a new dirty set.
 func (m *machine) newDirtySet() *dirtySet {
 	return &dirtySet{
-		vCPUs: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
+		vCPUMasks: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
 	}
 }
 
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 923ce3909..acc823ba6 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -51,9 +51,10 @@ func (m *machine) initArchState() error {
 		recover()
 		debug.SetPanicOnFault(old)
 	}()
-	m.retryInGuest(func() {
-		ring0.SetCPUIDFaulting(true)
-	})
+	c := m.Get()
+	defer m.Put(c)
+	bluepill(c)
+	ring0.SetCPUIDFaulting(true)
 
 	return nil
 }
@@ -89,8 +90,8 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 	defer m.mu.Unlock()
 
 	// Clear from all PCIDs.
-	for _, c := range m.vCPUs {
-		if c.PCIDs != nil {
+	for _, c := range m.vCPUsByID {
+		if c != nil && c.PCIDs != nil {
 			c.PCIDs.Drop(pt)
 		}
 	}
@@ -335,29 +336,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
 	}
 }
 
-// retryInGuest runs the given function in guest mode.
-//
-// If the function does not complete in guest mode (due to execution of a
-// system call due to a GC stall, for example), then it will be retried. The
-// given function must be idempotent as a result of the retry mechanism.
-func (m *machine) retryInGuest(fn func()) {
-	c := m.Get()
-	defer m.Put(c)
-	for {
-		c.ClearErrorCode() // See below.
-		bluepill(c)        // Force guest mode.
-		fn()               // Execute the given function.
-		_, user := c.ErrorCode()
-		if user {
-			// If user is set, then we haven't bailed back to host
-			// mode via a kernel exception or system call. We
-			// consider the full function to have executed in guest
-			// mode and we can return.
-			break
-		}
-	}
-}
-
 // On x86 platform, the flags for "setMemoryRegion" can always be set as 0.
 // There is no need to return read-only physicalRegions.
 func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 7156c245f..290f035dd 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -154,7 +154,7 @@ func (c *vCPU) setUserRegisters(uregs *userRegs) error {
 //
 //go:nosplit
 func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
-	if _, _, errno := syscall.RawSyscall(
+	if _, _, errno := syscall.RawSyscall( // escapes: no.
 		syscall.SYS_IOCTL,
 		uintptr(c.fd),
 		_KVM_GET_REGS,
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index e42505542..f3bf973de 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -60,6 +60,12 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
 		if !vr.accessType.Write && vr.accessType.Read {
 			rdonlyRegions = append(rdonlyRegions, vr.region)
 		}
+
+		// TODO(gvisor.dev/issue/2686): PROT_NONE should be specially treated.
+		// Workaround: treated as rdonly temporarily.
+		if !vr.accessType.Write && !vr.accessType.Read && !vr.accessType.Execute {
+			rdonlyRegions = append(rdonlyRegions, vr.region)
+		}
 	})
 
 	for _, r := range rdonlyRegions {
@@ -100,7 +106,7 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 	defer m.mu.Unlock()
 
 	// Clear from all PCIDs.
-	for _, c := range m.vCPUs {
+	for _, c := range m.vCPUsByID {
 		if c.PCIDs != nil {
 			c.PCIDs.Drop(pt)
 		}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 3c02cef7c..48c834499 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -159,6 +159,10 @@ func (c *vCPU) initArchState() error {
 		return err
 	}
 
+	// Use the address of the exception vector table as
+	// the MMIO address base.
+	arm64HypercallMMIOBase = toLocation
+
 	data = ring0.PsrDefaultSet | ring0.KernelFlagsSet
 	reg.id = _KVM_ARM64_REGS_PSTATE
 	if err := c.setOneRegister(&reg); err != nil {
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index f04be2ab5..9f86f6a7a 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
@@ -115,7 +115,7 @@ func (a *atomicAddressSpace) get() *addressSpace {
 //
 //go:nosplit
 func (c *vCPU) notify() {
-	_, _, errno := syscall.RawSyscall6(
+	_, _, errno := syscall.RawSyscall6( // escapes: no.
 		syscall.SYS_FUTEX,
 		uintptr(unsafe.Pointer(&c.state)),
 		linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
index 2ae6b9f9d..0bee995e4 100644
--- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index db6465663..2bc5f3ecd 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -362,9 +362,17 @@ mmio_exit:
 	MOVD R1, CPU_LAZY_VFP(RSV_REG)
 	VFP_DISABLE
 
-	// MMIO_EXIT.
-	MOVD $0, R9
-	MOVD R0, 0xffff000000001000(R9)
+	// Trigger MMIO_EXIT/_KVM_HYPERCALL_VMEXIT.
+	//
+	// To keep it simple, I used the address of exception table as the
+	// MMIO base address, so that I can trigger a MMIO-EXIT by forcibly writing
+	// a read-only space.
+	// Also, the length is engough to match a sufficient number of hypercall ID.
+	// Then, in host user space, I can calculate this address to find out
+	// which hypercall.
+	MRS VBAR_EL1, R9
+	MOVD R0, 0x0(R9)
+
 	RET
 
 // HaltAndResume halts execution and point the pointer to the resume function.
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 900c0bba7..021693791 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -31,23 +31,39 @@ type defaultHooks struct{}
 
 // KernelSyscall implements Hooks.KernelSyscall.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (defaultHooks) KernelSyscall() { Halt() }
+func (defaultHooks) KernelSyscall() {
+	Halt()
+}
 
 // KernelException implements Hooks.KernelException.
 //
+// +checkescape:all
+//
 //go:nosplit
-func (defaultHooks) KernelException(Vector) { Halt() }
+func (defaultHooks) KernelException(Vector) {
+	Halt()
+}
 
 // kernelSyscall is a trampoline.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
-func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() }
+func kernelSyscall(c *CPU) {
+	c.hooks.KernelSyscall()
+}
 
 // kernelException is a trampoline.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
-func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) }
+func kernelException(c *CPU, vector Vector) {
+	c.hooks.KernelException(vector)
+}
 
 // Init initializes a new CPU.
 //
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index 0feff8778..d37981dbf 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -178,6 +178,8 @@ func IsCanonical(addr uint64) bool {
 //
 // Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
 //
+// +checkescape:all
+//
 //go:nosplit
 func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 	userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
@@ -192,9 +194,9 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 
 	// Perform the switch.
 	swapgs()                                         // GS will be swapped on return.
-	WriteFS(uintptr(regs.Fs_base))                   // Set application FS.
-	WriteGS(uintptr(regs.Gs_base))                   // Set application GS.
-	LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point.
+	WriteFS(uintptr(regs.Fs_base))                   // escapes: no. Set application FS.
+	WriteGS(uintptr(regs.Gs_base))                   // escapes: no. Set application GS.
+	LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
 	jumpToKernel()                                   // Switch to upper half.
 	writeCR3(uintptr(userCR3))                       // Change to user address space.
 	if switchOpts.FullRestore {
@@ -204,8 +206,8 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
 	}
 	writeCR3(uintptr(kernelCR3))                     // Return to kernel address space.
 	jumpToUser()                                     // Return to lower half.
-	SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point.
-	WriteFS(uintptr(c.registers.Fs_base))            // Restore kernel FS.
+	SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point.
+	WriteFS(uintptr(c.registers.Fs_base))            // escapes: no. Restore kernel FS.
 	return
 }
 
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index 444a83913..a6345010d 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -38,6 +38,12 @@ func SaveVRegs(*byte)
 // LoadVRegs loads V0-V31 registers.
 func LoadVRegs(*byte)
 
+// GetTLS returns the value of TPIDR_EL0 register.
+func GetTLS() (value uint64)
+
+// SetTLS writes the TPIDR_EL0 value.
+func SetTLS(value uint64)
+
 // Init sets function pointers based on architectural features.
 //
 // This must be called prior to using ring0.
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
index 0e6a6235b..b63e14b41 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.s
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -15,6 +15,16 @@
 #include "funcdata.h"
 #include "textflag.h"
 
+TEXT ·GetTLS(SB),NOSPLIT,$0-8
+	MRS TPIDR_EL0, R1
+	MOVD R1, ret+0(FP)
+	RET
+
+TEXT ·SetTLS(SB),NOSPLIT,$0-8
+	MOVD addr+0(FP), R1
+	MSR R1, TPIDR_EL0
+	RET
+
 TEXT ·CPACREL1(SB),NOSPLIT,$0-8
 	WORD $0xd5381041 	// MRS CPACR_EL1, R1
 	MOVD R1, ret+0(FP)
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go
index 23fd5c352..8d75b7599 100644
--- a/pkg/sentry/platform/ring0/pagetables/allocator.go
+++ b/pkg/sentry/platform/ring0/pagetables/allocator.go
@@ -53,9 +53,14 @@ type RuntimeAllocator struct {
 
 // NewRuntimeAllocator returns an allocator that uses runtime allocation.
 func NewRuntimeAllocator() *RuntimeAllocator {
-	return &RuntimeAllocator{
-		used: make(map[*PTEs]struct{}),
-	}
+	r := new(RuntimeAllocator)
+	r.Init()
+	return r
+}
+
+// Init initializes a RuntimeAllocator.
+func (r *RuntimeAllocator) Init() {
+	r.used = make(map[*PTEs]struct{})
 }
 
 // Recycle returns freed pages to the pool.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 87e88e97d..7f18ac296 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -86,6 +86,8 @@ func (*mapVisitor) requiresSplit() bool { return true }
 //
 // Precondition: addr & length must be page-aligned, their sum must not overflow.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
 	if !opts.AccessType.Any() {
@@ -128,6 +130,8 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 //
 // Precondition: addr & length must be page-aligned.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
 	w := unmapWalker{
@@ -162,6 +166,8 @@ func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
 //
 // Precondition: addr & length must be page-aligned.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
 	w := emptyWalker{
@@ -197,6 +203,8 @@ func (*lookupVisitor) requiresSplit() bool { return false }
 
 // Lookup returns the physical address for the given virtual address.
 //
+// +checkescape:hard,stack
+//
 //go:nosplit
 func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
 	mask := uintptr(usermem.PageSize - 1)
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index e82d6cd1e..60c9896fc 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -34,6 +34,7 @@ go_library(
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/control",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip/stack",
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index b49433326..c11e82c10 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -555,7 +555,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b
 		if uint64(src.NumBytes()) != srcs.NumBytes() {
 			return 0, nil
 		}
-		if srcs.IsEmpty() {
+		if srcs.IsEmpty() && len(controlBuf) == 0 {
 			return 0, nil
 		}
 
diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go
index 677743113..027add1fd 100644
--- a/pkg/sentry/socket/hostinet/socket_vfs2.go
+++ b/pkg/sentry/socket/hostinet/socket_vfs2.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -35,6 +36,7 @@ import (
 type socketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
 
 	// We store metadata for hostinet sockets internally. Technically, we should
 	// access metadata (e.g. through stat, chmod) on the host for correctness,
@@ -59,6 +61,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in
 			fd:       fd,
 		},
 	}
+	s.LockFD.Init(&lock.FileLocks{})
 	if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
 		return nil, syserr.FromError(err)
 	}
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 789bb94c8..66015e2bc 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -64,6 +64,8 @@ const enableLogging = false
 var emptyFilter = stack.IPHeaderFilter{
 	Dst:     "\x00\x00\x00\x00",
 	DstMask: "\x00\x00\x00\x00",
+	Src:     "\x00\x00\x00\x00",
+	SrcMask: "\x00\x00\x00\x00",
 }
 
 // nflog logs messages related to the writing and reading of iptables.
@@ -142,31 +144,27 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen
 }
 
 func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) {
-	ipt := stk.IPTables()
-	table, ok := ipt.Tables[tablename.String()]
+	table, ok := stk.IPTables().GetTable(tablename.String())
 	if !ok {
 		return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename)
 	}
 	return table, nil
 }
 
-// FillDefaultIPTables sets stack's IPTables to the default tables and
-// populates them with metadata.
-func FillDefaultIPTables(stk *stack.Stack) {
-	ipt := stack.DefaultTables()
-
-	// In order to fill in the metadata, we have to translate ipt from its
-	// netstack format to Linux's giant-binary-blob format.
-	for name, table := range ipt.Tables {
-		_, metadata, err := convertNetstackToBinary(name, table)
-		if err != nil {
-			panic(fmt.Errorf("Unable to set default IP tables: %v", err))
+// FillIPTablesMetadata populates stack's IPTables with metadata.
+func FillIPTablesMetadata(stk *stack.Stack) {
+	stk.IPTables().ModifyTables(func(tables map[string]stack.Table) {
+		// In order to fill in the metadata, we have to translate ipt from its
+		// netstack format to Linux's giant-binary-blob format.
+		for name, table := range tables {
+			_, metadata, err := convertNetstackToBinary(name, table)
+			if err != nil {
+				panic(fmt.Errorf("Unable to set default IP tables: %v", err))
+			}
+			table.SetMetadata(metadata)
+			tables[name] = table
 		}
-		table.SetMetadata(metadata)
-		ipt.Tables[name] = table
-	}
-
-	stk.SetIPTables(ipt)
+	})
 }
 
 // convertNetstackToBinary converts the iptables as stored in netstack to the
@@ -214,11 +212,16 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI
 		}
 		copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst)
 		copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask)
+		copy(entry.IPTEntry.IP.Src[:], rule.Filter.Src)
+		copy(entry.IPTEntry.IP.SrcMask[:], rule.Filter.SrcMask)
 		copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface)
 		copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
 		if rule.Filter.DstInvert {
 			entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP
 		}
+		if rule.Filter.SrcInvert {
+			entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_SRCIP
+		}
 		if rule.Filter.OutputInterfaceInvert {
 			entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
 		}
@@ -566,15 +569,13 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
 	// - There are no chains without an unconditional final rule.
 	// - There are no chains without an unconditional underflow rule.
 
-	ipt := stk.IPTables()
 	table.SetMetadata(metadata{
 		HookEntry:  replace.HookEntry,
 		Underflow:  replace.Underflow,
 		NumEntries: replace.NumEntries,
 		Size:       replace.Size,
 	})
-	ipt.Tables[replace.Name.String()] = table
-	stk.SetIPTables(ipt)
+	stk.IPTables().ReplaceTable(replace.Name.String(), table)
 
 	return nil
 }
@@ -737,6 +738,9 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
 	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
 		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
 	}
+	if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
+		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
+	}
 
 	n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0)
 	if n == -1 {
@@ -755,6 +759,9 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
 		Dst:                   tcpip.Address(iptip.Dst[:]),
 		DstMask:               tcpip.Address(iptip.DstMask[:]),
 		DstInvert:             iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
+		Src:                   tcpip.Address(iptip.Src[:]),
+		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
+		SrcInvert:             iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
 		OutputInterface:       ifname,
 		OutputInterfaceMask:   ifnameMask,
 		OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
@@ -765,15 +772,13 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool {
 	// The following features are supported:
 	// - Protocol
 	// - Dst and DstMask
+	// - Src and SrcMask
 	// - The inverse destination IP check flag
 	// - OutputInterface, OutputInterfaceMask and its inverse.
-	var emptyInetAddr = linux.InetAddr{}
 	var emptyInterface = [linux.IFNAMSIZ]byte{}
 	// Disable any supported inverse flags.
-	inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_VIA_OUT)
-	return iptip.Src != emptyInetAddr ||
-		iptip.SrcMask != emptyInetAddr ||
-		iptip.InputInterface != emptyInterface ||
+	inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT)
+	return iptip.InputInterface != emptyInterface ||
 		iptip.InputInterfaceMask != emptyInterface ||
 		iptip.Flags != 0 ||
 		iptip.InverseFlags&^inverseMask != 0
diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go
index 3863293c7..1b4e0ad79 100644
--- a/pkg/sentry/socket/netfilter/owner_matcher.go
+++ b/pkg/sentry/socket/netfilter/owner_matcher.go
@@ -111,7 +111,7 @@ func (*OwnerMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
+func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
 	// Support only for OUTPUT chain.
 	// TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also.
 	if hook != stack.Output {
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 57a1e1c12..4f98ee2d5 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -96,7 +96,7 @@ func (*TCPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
+func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	if netHeader.TransportProtocol() != header.TCPProtocolNumber {
@@ -111,36 +111,10 @@ func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		return false, false
 	}
 
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the stack.Check codepath as matchers are
-	// added.
-	var tcpHeader header.TCP
-	if pkt.TransportHeader != nil {
-		tcpHeader = header.TCP(pkt.TransportHeader)
-	} else {
-		var length int
-		if hook == stack.Prerouting {
-			// The network header hasn't been parsed yet. We have to do it here.
-			hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
-			if !ok {
-				// There's no valid TCP header here, so we hotdrop the
-				// packet.
-				return false, true
-			}
-			h := header.IPv4(hdr)
-			pkt.NetworkHeader = hdr
-			length = int(h.HeaderLength())
-		}
-		// The TCP header hasn't been parsed yet. We have to do it here.
-		hdr, ok := pkt.Data.PullUp(length + header.TCPMinimumSize)
-		if !ok {
-			// There's no valid TCP header here, so we hotdrop the
-			// packet.
-			return false, true
-		}
-		tcpHeader = header.TCP(hdr[length:])
+	tcpHeader := header.TCP(pkt.TransportHeader)
+	if len(tcpHeader) < header.TCPMinimumSize {
+		// There's no valid TCP header here, so we drop the packet immediately.
+		return false, true
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index cfa9e621d..3f20fc891 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -93,7 +93,7 @@ func (*UDPMatcher) Name() string {
 }
 
 // Match implements Matcher.Match.
-func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) {
+func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
 	netHeader := header.IPv4(pkt.NetworkHeader)
 
 	// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
@@ -110,36 +110,10 @@ func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa
 		return false, false
 	}
 
-	// Now we need the transport header. However, this may not have been set
-	// yet.
-	// TODO(gvisor.dev/issue/170): Parsing the transport header should
-	// ultimately be moved into the stack.Check codepath as matchers are
-	// added.
-	var udpHeader header.UDP
-	if pkt.TransportHeader != nil {
-		udpHeader = header.UDP(pkt.TransportHeader)
-	} else {
-		var length int
-		if hook == stack.Prerouting {
-			// The network header hasn't been parsed yet. We have to do it here.
-			hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
-			if !ok {
-				// There's no valid UDP header here, so we hotdrop the
-				// packet.
-				return false, true
-			}
-			h := header.IPv4(hdr)
-			pkt.NetworkHeader = hdr
-			length = int(h.HeaderLength())
-		}
-		// The UDP header hasn't been parsed yet. We have to do it here.
-		hdr, ok := pkt.Data.PullUp(length + header.UDPMinimumSize)
-		if !ok {
-			// There's no valid UDP header here, so we hotdrop the
-			// packet.
-			return false, true
-		}
-		udpHeader = header.UDP(hdr[length:])
+	udpHeader := header.UDP(pkt.TransportHeader)
+	if len(udpHeader) < header.UDPMinimumSize {
+		// There's no valid UDP header here, so we drop the packet immediately.
+		return false, true
 	}
 
 	// Check whether the source and destination ports are within the
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 7212d8644..420e573c9 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -29,6 +29,7 @@ go_library(
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go
index b854bf990..8bfee5193 100644
--- a/pkg/sentry/socket/netlink/socket_vfs2.go
+++ b/pkg/sentry/socket/netlink/socket_vfs2.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -40,6 +41,7 @@ type SocketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.LockFD
 
 	socketOpsCommon
 }
@@ -66,7 +68,7 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
 		return nil, err
 	}
 
-	return &SocketVFS2{
+	fd := &SocketVFS2{
 		socketOpsCommon: socketOpsCommon{
 			ports:          t.Kernel().NetlinkPorts(),
 			protocol:       protocol,
@@ -75,7 +77,9 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV
 			connection:     connection,
 			sendBufferSize: defaultSendBufferSize,
 		},
-	}, nil
+	}
+	fd.LockFD.Init(&lock.FileLocks{})
+	return fd, nil
 }
 
 // Readiness implements waiter.Waitable.Readiness.
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index 333e0042e..0f592ecc3 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -37,6 +37,7 @@ go_library(
         "//pkg/sentry/socket/netfilter",
         "//pkg/sentry/unimpl",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
@@ -50,5 +51,6 @@ go_library(
         "//pkg/tcpip/transport/udp",
         "//pkg/usermem",
         "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 9dea2b5ff..738277391 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -33,6 +33,7 @@ import (
 	"syscall"
 	"time"
 
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/amutex"
 	"gvisor.dev/gvisor/pkg/binary"
@@ -719,6 +720,14 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool
 	defer s.EventUnregister(&e)
 
 	if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting {
+		if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
+			// TCP unlike UDP returns EADDRNOTAVAIL when it can't
+			// find an available local ephemeral port.
+			if err == tcpip.ErrNoPortAvailable {
+				return syserr.ErrAddressNotAvailable
+			}
+		}
+
 		return syserr.TranslateNetstackError(err)
 	}
 
@@ -1237,6 +1246,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
 
 		return int32(time.Duration(v) / time.Second), nil
 
+	case linux.TCP_KEEPCNT:
+		if outLen < sizeOfInt32 {
+			return nil, syserr.ErrInvalidArgument
+		}
+
+		v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption)
+		if err != nil {
+			return nil, syserr.TranslateNetstackError(err)
+		}
+
+		return int32(v), nil
+
 	case linux.TCP_USER_TIMEOUT:
 		if outLen < sizeOfInt32 {
 			return nil, syserr.ErrInvalidArgument
@@ -1777,6 +1798,17 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
 		}
 		return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
 
+	case linux.TCP_KEEPCNT:
+		if len(optVal) < sizeOfInt32 {
+			return syserr.ErrInvalidArgument
+		}
+
+		v := usermem.ByteOrder.Uint32(optVal)
+		if v < 1 || v > linux.MAX_TCP_KEEPCNT {
+			return syserr.ErrInvalidArgument
+		}
+		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v)))
+
 	case linux.TCP_USER_TIMEOUT:
 		if len(optVal) < sizeOfInt32 {
 			return syserr.ErrInvalidArgument
@@ -2106,30 +2138,20 @@ func emitUnimplementedEventTCP(t *kernel.Task, name int) {
 	switch name {
 	case linux.TCP_CONGESTION,
 		linux.TCP_CORK,
-		linux.TCP_DEFER_ACCEPT,
 		linux.TCP_FASTOPEN,
 		linux.TCP_FASTOPEN_CONNECT,
 		linux.TCP_FASTOPEN_KEY,
 		linux.TCP_FASTOPEN_NO_COOKIE,
-		linux.TCP_KEEPCNT,
-		linux.TCP_KEEPIDLE,
-		linux.TCP_KEEPINTVL,
-		linux.TCP_LINGER2,
-		linux.TCP_MAXSEG,
 		linux.TCP_QUEUE_SEQ,
-		linux.TCP_QUICKACK,
 		linux.TCP_REPAIR,
 		linux.TCP_REPAIR_QUEUE,
 		linux.TCP_REPAIR_WINDOW,
 		linux.TCP_SAVED_SYN,
 		linux.TCP_SAVE_SYN,
-		linux.TCP_SYNCNT,
 		linux.TCP_THIN_DUPACK,
 		linux.TCP_THIN_LINEAR_TIMEOUTS,
 		linux.TCP_TIMESTAMP,
-		linux.TCP_ULP,
-		linux.TCP_USER_TIMEOUT,
-		linux.TCP_WINDOW_CLAMP:
+		linux.TCP_ULP:
 
 		t.Kernel().EmitUnimplementedEvent(t)
 	}
@@ -2718,7 +2740,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy
 			v = math.MaxInt32
 		}
 
-		// Copy result to user-space.
+		// Copy result to userspace.
 		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
 			AddressSpaceActive: true,
 		})
@@ -2787,7 +2809,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
 		if v > math.MaxInt32 {
 			v = math.MaxInt32
 		}
-		// Copy result to user-space.
+		// Copy result to userspace.
 		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
 			AddressSpaceActive: true,
 		})
@@ -2803,7 +2825,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc
 			v = math.MaxInt32
 		}
 
-		// Copy result to user-space.
+		// Copy result to userspace.
 		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
 			AddressSpaceActive: true,
 		})
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index fcd8013c0..1412a4810 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -38,6 +39,7 @@ type SocketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.LockFD
 
 	socketOpsCommon
 }
@@ -64,6 +66,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu
 			protocol: protocol,
 		},
 	}
+	s.LockFD.Init(&lock.FileLocks{})
 	vfsfd := &s.vfsfd
 	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
 		DenyPRead:         true,
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index f5fa18136..9b44c2b89 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -362,14 +362,13 @@ func (s *Stack) RouteTable() []inet.Route {
 }
 
 // IPTables returns the stack's iptables.
-func (s *Stack) IPTables() (stack.IPTables, error) {
+func (s *Stack) IPTables() (*stack.IPTables, error) {
 	return s.Stack.IPTables(), nil
 }
 
-// FillDefaultIPTables sets the stack's iptables to the default tables, which
-// allow and do not modify all traffic.
-func (s *Stack) FillDefaultIPTables() {
-	netfilter.FillDefaultIPTables(s.Stack)
+// FillIPTablesMetadata populates stack's IPTables with metadata.
+func (s *Stack) FillIPTablesMetadata() {
+	netfilter.FillIPTablesMetadata(s.Stack)
 }
 
 // Resume implements inet.Stack.Resume.
diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD
index de2cc4bdf..7d4cc80fe 100644
--- a/pkg/sentry/socket/unix/BUILD
+++ b/pkg/sentry/socket/unix/BUILD
@@ -29,6 +29,7 @@ go_library(
         "//pkg/sentry/socket/netstack",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/lock",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/tcpip",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index ce5b94ee7..09c6d3b27 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -252,7 +252,7 @@ func (e *connectionedEndpoint) Close() {
 // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
 func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error {
 	if ce.Type() != e.stype {
-		return syserr.ErrConnectionRefused
+		return syserr.ErrWrongProtocolForSocket
 	}
 
 	// Check if ce is e to avoid a deadlock.
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 5b29e9d7f..4bb2b6ff4 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -417,7 +417,18 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool
 	defer ep.Release()
 
 	// Connect the server endpoint.
-	return s.ep.Connect(t, ep)
+	err = s.ep.Connect(t, ep)
+
+	if err == syserr.ErrWrongProtocolForSocket {
+		// Linux for abstract sockets returns ErrConnectionRefused
+		// instead of ErrWrongProtocolForSocket.
+		path, _ := extractPath(sockaddr)
+		if len(path) > 0 && path[0] == 0 {
+			err = syserr.ErrConnectionRefused
+		}
+	}
+
+	return err
 }
 
 // Write implements fs.FileOperations.Write.
@@ -448,15 +459,25 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b
 		To:       nil,
 	}
 	if len(to) > 0 {
-		ep, err := extractEndpoint(t, to)
-		if err != nil {
-			return 0, err
-		}
-		defer ep.Release()
-		w.To = ep
+		switch s.stype {
+		case linux.SOCK_SEQPACKET:
+			to = nil
+		case linux.SOCK_STREAM:
+			if s.State() == linux.SS_CONNECTED {
+				return 0, syserr.ErrAlreadyConnected
+			}
+			return 0, syserr.ErrNotSupported
+		default:
+			ep, err := extractEndpoint(t, to)
+			if err != nil {
+				return 0, err
+			}
+			defer ep.Release()
+			w.To = ep
 
-		if ep.Passcred() && w.Control.Credentials == nil {
-			w.Control.Credentials = control.MakeCreds(t)
+			if ep.Passcred() && w.Control.Credentials == nil {
+				w.Control.Credentials = control.MakeCreds(t)
+			}
 		}
 	}
 
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 45e109361..8c32371a2 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -26,6 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/syserr"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -39,6 +40,7 @@ type SocketVFS2 struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
+	vfs.LockFD
 
 	socketOpsCommon
 }
@@ -51,7 +53,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType)
 	mnt := t.Kernel().SocketMount()
 	d := sockfs.NewDentry(t.Credentials(), mnt)
 
-	fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d)
+	fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &lock.FileLocks{})
 	if err != nil {
 		return nil, syserr.FromError(err)
 	}
@@ -60,7 +62,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType)
 
 // NewFileDescription creates and returns a socket file description
 // corresponding to the given mount and dentry.
-func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) {
+func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *lock.FileLocks) (*vfs.FileDescription, error) {
 	// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
 	// SOCK_DGRAM and don't require CAP_NET_RAW.
 	if stype == linux.SOCK_RAW {
@@ -73,6 +75,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
 			stype: stype,
 		},
 	}
+	sock.LockFD.Init(locks)
 	vfsfd := &sock.vfsfd
 	if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{
 		DenyPRead:         true,
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 35a98212a..8347617bd 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -998,9 +998,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			return 0, nil, err
 		}
 
-		// The lock uid is that of the Task's FDTable.
-		lockUniqueID := lock.UniqueID(t.FDTable().ID())
-
 		// These locks don't block; execute the non-blocking operation using the inode's lock
 		// context directly.
 		switch flock.Type {
@@ -1010,12 +1007,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			}
 			if cmd == linux.F_SETLK {
 				// Non-blocking lock, provide a nil lock.Blocker.
-				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) {
 					return 0, nil, syserror.EAGAIN
 				}
 			} else {
 				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
-				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) {
 					return 0, nil, syserror.EINTR
 				}
 			}
@@ -1026,18 +1023,18 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 			}
 			if cmd == linux.F_SETLK {
 				// Non-blocking lock, provide a nil lock.Blocker.
-				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) {
 					return 0, nil, syserror.EAGAIN
 				}
 			} else {
 				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
-				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+				if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) {
 					return 0, nil, syserror.EINTR
 				}
 			}
 			return 0, nil, nil
 		case linux.F_UNLCK:
-			file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng)
+			file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng)
 			return 0, nil, nil
 		default:
 			return 0, nil, syserror.EINVAL
@@ -2157,22 +2154,6 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	nonblocking := operation&linux.LOCK_NB != 0
 	operation &^= linux.LOCK_NB
 
-	// flock(2):
-	// Locks created by flock() are associated with an open file table entry. This means that
-	// duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the
-	// same lock, and this lock may be modified or released using any of these descriptors. Furthermore,
-	// the lock is released either by an explicit LOCK_UN operation on any of these duplicate
-	// descriptors, or when all such descriptors have been closed.
-	//
-	// If a process uses open(2) (or similar) to obtain more than one descriptor for the same file,
-	// these descriptors are treated independently by flock(). An attempt to lock the file using
-	// one of these file descriptors may be denied by a lock that the calling process has already placed via
-	// another descriptor.
-	//
-	// We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2)
-	// and fork(2).
-	lockUniqueID := lock.UniqueID(file.UniqueID)
-
 	// A BSD style lock spans the entire file.
 	rng := lock.LockRange{
 		Start: 0,
@@ -2183,29 +2164,29 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	case linux.LOCK_EX:
 		if nonblocking {
 			// Since we're nonblocking we pass a nil lock.Blocker implementation.
-			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) {
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) {
 				return 0, nil, syserror.EWOULDBLOCK
 			}
 		} else {
 			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
-			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) {
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) {
 				return 0, nil, syserror.EINTR
 			}
 		}
 	case linux.LOCK_SH:
 		if nonblocking {
 			// Since we're nonblocking we pass a nil lock.Blocker implementation.
-			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) {
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) {
 				return 0, nil, syserror.EWOULDBLOCK
 			}
 		} else {
 			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
-			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) {
+			if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) {
 				return 0, nil, syserror.EINTR
 			}
 		}
 	case linux.LOCK_UN:
-		file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng)
+		file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng)
 	default:
 		// flock(2): EINVAL operation is invalid.
 		return 0, nil, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index df0d0f461..77c78889d 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -16,7 +16,6 @@ package linux
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -26,7 +25,6 @@ import (
 
 // doSplice implements a blocking splice operation.
 func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) {
-	log.Infof("NLAC: doSplice opts: %+v", opts)
 	if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 || (opts.SrcStart+opts.Length < 0) {
 		return 0, syserror.EINVAL
 	}
@@ -82,6 +80,12 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
 		}
 	}
 
+	if total > 0 {
+		// On Linux, inotify behavior is not very consistent with splice(2). We try
+		// our best to emulate Linux for very basic calls to splice, where for some
+		// reason, events are generated for output files, but not input files.
+		outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
+	}
 	return total, err
 }
 
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
index 2de5e3422..c24946160 100644
--- a/pkg/sentry/syscalls/linux/sys_xattr.go
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -207,7 +207,11 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si
 		return syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.SetXattr(t, d, name, value, flags)
+	if err := d.Inode.SetXattr(t, d, name, value, flags); err != nil {
+		return err
+	}
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+	return nil
 }
 
 func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
@@ -418,7 +422,11 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
 		return syserror.EOPNOTSUPP
 	}
 
-	return d.Inode.RemoveXattr(t, d, name)
+	if err := d.Inode.RemoveXattr(t, d, name); err != nil {
+		return err
+	}
+	d.InotifyEvent(linux.IN_ATTRIB, 0)
+	return nil
 }
 
 // LINT.ThenChange(vfs2/xattr.go)
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index f882ef840..9f93f4354 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -12,9 +12,12 @@ go_library(
         "filesystem.go",
         "fscontext.go",
         "getdents.go",
+        "inotify.go",
         "ioctl.go",
+        "lock.go",
         "memfd.go",
         "mmap.go",
+        "mount.go",
         "path.go",
         "pipe.go",
         "poll.go",
@@ -22,6 +25,7 @@ go_library(
         "setstat.go",
         "signal.go",
         "socket.go",
+        "splice.go",
         "stat.go",
         "stat_amd64.go",
         "stat_arm64.go",
@@ -39,6 +43,7 @@ go_library(
         "//pkg/fspath",
         "//pkg/gohacks",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsbridge",
         "//pkg/sentry/fsimpl/eventfd",
         "//pkg/sentry/fsimpl/pipefs",
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index ca0f7fd1e..6006758a5 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -168,7 +168,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 		err := tmpfs.AddSeals(file, args[2].Uint())
 		return 0, nil, err
 	default:
-		// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
+		// TODO(gvisor.dev/issue/2920): Everything else is not yet supported.
 		return 0, nil, syserror.EINVAL
 	}
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
new file mode 100644
index 000000000..7d50b6a16
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -0,0 +1,134 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC
+
+// InotifyInit1 implements the inotify_init1() syscalls.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+	if flags&^allFlags != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
+	if err != nil {
+		return 0, nil, err
+	}
+	defer ino.DecRef()
+
+	fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{
+		CloseOnExec: flags&linux.IN_CLOEXEC != 0,
+	})
+
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscalls.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	args[0].Value = 0
+	return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. If successful, the file will
+// have an extra ref and the caller is responsible for releasing the ref.
+func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) {
+	f := t.GetFileVFS2(fd)
+	if f == nil {
+		// Invalid fd.
+		return nil, nil, syserror.EBADF
+	}
+
+	ino, ok := f.Impl().(*vfs.Inotify)
+	if !ok {
+		// Not an inotify fd.
+		f.DecRef()
+		return nil, nil, syserror.EINVAL
+	}
+
+	return ino, f, nil
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	addr := args[1].Pointer()
+	mask := args[2].Uint()
+
+	// "EINVAL: The given event mask contains no valid events."
+	// -- inotify_add_watch(2)
+	if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+	//  -- inotify(7)
+	follow := followFinalSymlink
+	if mask&linux.IN_DONT_FOLLOW == 0 {
+		follow = nofollowFinalSymlink
+	}
+
+	ino, f, err := fdToInotify(t, fd)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer f.DecRef()
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	if mask&linux.IN_ONLYDIR != 0 {
+		path.Dir = true
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+	d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
+	if err != nil {
+		return 0, nil, err
+	}
+	defer d.DecRef()
+
+	fd = ino.AddWatch(d.Dentry(), mask)
+	return uintptr(fd), nil, err
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	wd := args[1].Int()
+
+	ino, f, err := fdToInotify(t, fd)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer f.DecRef()
+	return 0, nil, ino.RmWatch(wd)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go
new file mode 100644
index 000000000..bf19028c4
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/lock.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Flock implements linux syscall flock(2).
+func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	fd := args[0].Int()
+	operation := args[1].Int()
+
+	file := t.GetFileVFS2(fd)
+	if file == nil {
+		// flock(2): EBADF fd is not an open file descriptor.
+		return 0, nil, syserror.EBADF
+	}
+	defer file.DecRef()
+
+	nonblocking := operation&linux.LOCK_NB != 0
+	operation &^= linux.LOCK_NB
+
+	var blocker lock.Blocker
+	if !nonblocking {
+		blocker = t
+	}
+
+	switch operation {
+	case linux.LOCK_EX:
+		if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil {
+			return 0, nil, err
+		}
+	case linux.LOCK_SH:
+		if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil {
+			return 0, nil, err
+		}
+	case linux.LOCK_UN:
+		if err := file.UnlockBSD(t); err != nil {
+			return 0, nil, err
+		}
+	default:
+		// flock(2): EINVAL operation is invalid.
+		return 0, nil, syserror.EINVAL
+	}
+
+	return 0, nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go
new file mode 100644
index 000000000..adeaa39cc
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mount.go
@@ -0,0 +1,145 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Mount implements Linux syscall mount(2).
+func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	sourceAddr := args[0].Pointer()
+	targetAddr := args[1].Pointer()
+	typeAddr := args[2].Pointer()
+	flags := args[3].Uint64()
+	dataAddr := args[4].Pointer()
+
+	// For null-terminated strings related to mount(2), Linux copies in at most
+	// a page worth of data. See fs/namespace.c:copy_mount_string().
+	fsType, err := t.CopyInString(typeAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+	source, err := t.CopyInString(sourceAddr, usermem.PageSize)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	targetPath, err := copyInPath(t, targetAddr)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	data := ""
+	if dataAddr != 0 {
+		// In Linux, a full page is always copied in regardless of null
+		// character placement, and the address is passed to each file system.
+		// Most file systems always treat this data as a string, though, and so
+		// do all of the ones we implement.
+		data, err = t.CopyInString(dataAddr, usermem.PageSize)
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	// Ignore magic value that was required before Linux 2.4.
+	if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
+		flags = flags &^ linux.MS_MGC_MSK
+	}
+
+	// Must have CAP_SYS_ADMIN in the current mount namespace's associated user
+	// namespace.
+	creds := t.Credentials()
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+		return 0, nil, syserror.EPERM
+	}
+
+	const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
+		linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
+		linux.MS_UNBINDABLE | linux.MS_MOVE
+
+	// Silently allow MS_NOSUID, since we don't implement set-id bits
+	// anyway.
+	const unsupportedFlags = linux.MS_NODEV |
+		linux.MS_NODIRATIME | linux.MS_STRICTATIME
+
+	// Linux just allows passing any flags to mount(2) - it won't fail when
+	// unknown or unsupported flags are passed. Since we don't implement
+	// everything, we fail explicitly on flags that are unimplemented.
+	if flags&(unsupportedOps|unsupportedFlags) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	var opts vfs.MountOptions
+	if flags&linux.MS_NOATIME == linux.MS_NOATIME {
+		opts.Flags.NoATime = true
+	}
+	if flags&linux.MS_NOEXEC == linux.MS_NOEXEC {
+		opts.Flags.NoExec = true
+	}
+	if flags&linux.MS_RDONLY == linux.MS_RDONLY {
+		opts.ReadOnly = true
+	}
+	opts.GetFilesystemOptions.Data = data
+
+	target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer target.Release()
+
+	return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts)
+}
+
+// Umount2 implements Linux syscall umount2(2).
+func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Int()
+
+	// Must have CAP_SYS_ADMIN in the mount namespace's associated user
+	// namespace.
+	//
+	// Currently, this is always the init task's user namespace.
+	creds := t.Credentials()
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
+		return 0, nil, syserror.EPERM
+	}
+
+	const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE
+	if flags&unsupported != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	path, err := copyInPath(t, addr)
+	if err != nil {
+		return 0, nil, err
+	}
+	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink)
+	if err != nil {
+		return 0, nil, err
+	}
+	defer tpop.Release()
+
+	opts := vfs.UmountOptions{
+		Flags: uint32(flags),
+	}
+
+	return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 3a7ef24f5..7f9debd4a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -93,11 +93,17 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
 	n, err := file.Read(t, dst, opts)
 	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
 	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
 	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
@@ -128,6 +134,9 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
 	}
 	file.EventUnregister(&w)
 
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
 	return total, err
 }
 
@@ -248,11 +257,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
 func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	n, err := file.PRead(t, dst, offset, opts)
 	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
 	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
 	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
@@ -283,6 +298,9 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of
 	}
 	file.EventUnregister(&w)
 
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
 	return total, err
 }
 
@@ -345,11 +363,17 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
 func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	n, err := file.Write(t, src, opts)
 	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
 	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
 	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
@@ -380,6 +404,9 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op
 	}
 	file.EventUnregister(&w)
 
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	}
 	return total, err
 }
 
@@ -500,11 +527,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	n, err := file.PWrite(t, src, offset, opts)
 	if err != syserror.ErrWouldBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
 	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
 	if !allowBlock {
+		if n > 0 {
+			file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+		}
 		return n, err
 	}
 
@@ -535,6 +568,9 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o
 	}
 	file.EventUnregister(&w)
 
+	if total > 0 {
+		file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent)
+	}
 	return total, err
 }
 
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
new file mode 100644
index 000000000..945a364a7
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -0,0 +1,291 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Splice implements Linux syscall splice(2).
+func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	inOffsetPtr := args[1].Pointer()
+	outFD := args[2].Int()
+	outOffsetPtr := args[3].Pointer()
+	count := int64(args[4].SizeT())
+	flags := args[5].Int()
+
+	if count == 0 {
+		return 0, nil, nil
+	}
+	if count > int64(kernel.MAX_RW_COUNT) {
+		count = int64(kernel.MAX_RW_COUNT)
+	}
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get file descriptions.
+	inFile := t.GetFileVFS2(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+	outFile := t.GetFileVFS2(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	// Check that both files support the required directionality.
+	if !inFile.IsReadable() || !outFile.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// At least one file description must represent a pipe.
+	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+	if !inIsPipe && !outIsPipe {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy in offsets.
+	inOffset := int64(-1)
+	if inOffsetPtr != 0 {
+		if inIsPipe {
+			return 0, nil, syserror.ESPIPE
+		}
+		if inFile.Options().DenyPRead {
+			return 0, nil, syserror.EINVAL
+		}
+		if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil {
+			return 0, nil, err
+		}
+		if inOffset < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+	outOffset := int64(-1)
+	if outOffsetPtr != 0 {
+		if outIsPipe {
+			return 0, nil, syserror.ESPIPE
+		}
+		if outFile.Options().DenyPWrite {
+			return 0, nil, syserror.EINVAL
+		}
+		if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil {
+			return 0, nil, err
+		}
+		if outOffset < 0 {
+			return 0, nil, syserror.EINVAL
+		}
+	}
+
+	// Move data.
+	var (
+		n     int64
+		err   error
+		inCh  chan struct{}
+		outCh chan struct{}
+	)
+	for {
+		// If both input and output are pipes, delegate to the pipe
+		// implementation. Otherwise, exactly one end is a pipe, which we
+		// ensure is consistently ordered after the non-pipe FD's locks by
+		// passing the pipe FD as usermem.IO to the non-pipe end.
+		switch {
+		case inIsPipe && outIsPipe:
+			n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
+		case inIsPipe:
+			if outOffset != -1 {
+				n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{})
+				outOffset += n
+			} else {
+				n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{})
+			}
+		case outIsPipe:
+			if inOffset != -1 {
+				n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{})
+				inOffset += n
+			} else {
+				n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{})
+			}
+		}
+		if n != 0 || err != syserror.ErrWouldBlock || nonBlock {
+			break
+		}
+
+		// Note that the blocking behavior here is a bit different than the
+		// normal pattern. Because we need to have both data to read and data
+		// to write simultaneously, we actually explicitly block on both of
+		// these cases in turn before returning to the splice operation.
+		if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+			if inCh == nil {
+				inCh = make(chan struct{}, 1)
+				inW, _ := waiter.NewChannelEntry(inCh)
+				inFile.EventRegister(&inW, eventMaskRead)
+				defer inFile.EventUnregister(&inW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(inCh); err != nil {
+				break
+			}
+		}
+		if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+			if outCh == nil {
+				outCh = make(chan struct{}, 1)
+				outW, _ := waiter.NewChannelEntry(outCh)
+				outFile.EventRegister(&outW, eventMaskWrite)
+				defer outFile.EventUnregister(&outW)
+				continue // Need to refresh readiness.
+			}
+			if err = t.Block(outCh); err != nil {
+				break
+			}
+		}
+	}
+
+	// Copy updated offsets out.
+	if inOffsetPtr != 0 {
+		if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil {
+			return 0, nil, err
+		}
+	}
+	if outOffsetPtr != 0 {
+		if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	if n == 0 {
+		return 0, nil, err
+	}
+
+	// On Linux, inotify behavior is not very consistent with splice(2). We try
+	// our best to emulate Linux for very basic calls to splice, where for some
+	// reason, events are generated for output files, but not input files.
+	outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent)
+	return uintptr(n), nil, nil
+}
+
+// Tee implements Linux syscall tee(2).
+func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	inFD := args[0].Int()
+	outFD := args[1].Int()
+	count := int64(args[2].SizeT())
+	flags := args[3].Int()
+
+	if count == 0 {
+		return 0, nil, nil
+	}
+	if count > int64(kernel.MAX_RW_COUNT) {
+		count = int64(kernel.MAX_RW_COUNT)
+	}
+
+	// Check for invalid flags.
+	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Get file descriptions.
+	inFile := t.GetFileVFS2(inFD)
+	if inFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer inFile.DecRef()
+	outFile := t.GetFileVFS2(outFD)
+	if outFile == nil {
+		return 0, nil, syserror.EBADF
+	}
+	defer outFile.DecRef()
+
+	// Check that both files support the required directionality.
+	if !inFile.IsReadable() || !outFile.IsWritable() {
+		return 0, nil, syserror.EBADF
+	}
+
+	// The operation is non-blocking if anything is non-blocking.
+	//
+	// N.B. This is a rather simplistic heuristic that avoids some
+	// poor edge case behavior since the exact semantics here are
+	// underspecified and vary between versions of Linux itself.
+	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)
+
+	// Both file descriptions must represent pipes.
+	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
+	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
+	if !inIsPipe || !outIsPipe {
+		return 0, nil, syserror.EINVAL
+	}
+
+	// Copy data.
+	var (
+		inCh  chan struct{}
+		outCh chan struct{}
+	)
+	for {
+		n, err := pipe.Tee(t, outPipeFD, inPipeFD, count)
+		if n != 0 {
+			return uintptr(n), nil, nil
+		}
+		if err != syserror.ErrWouldBlock || nonBlock {
+			return 0, nil, err
+		}
+
+		// Note that the blocking behavior here is a bit different than the
+		// normal pattern. Because we need to have both data to read and data
+		// to write simultaneously, we actually explicitly block on both of
+		// these cases in turn before returning to the tee operation.
+		if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 {
+			if inCh == nil {
+				inCh = make(chan struct{}, 1)
+				inW, _ := waiter.NewChannelEntry(inCh)
+				inFile.EventRegister(&inW, eventMaskRead)
+				defer inFile.EventUnregister(&inW)
+				continue // Need to refresh readiness.
+			}
+			if err := t.Block(inCh); err != nil {
+				return 0, nil, err
+			}
+		}
+		if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
+			if outCh == nil {
+				outCh = make(chan struct{}, 1)
+				outW, _ := waiter.NewChannelEntry(outCh)
+				outFile.EventRegister(&outW, eventMaskWrite)
+				defer outFile.EventUnregister(&outW)
+				continue // Need to refresh readiness.
+			}
+			if err := t.Block(outCh); err != nil {
+				return 0, nil, err
+			}
+		}
+	}
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index a332d01bd..954c82f97 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -62,7 +62,7 @@ func Override() {
 	s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt)
 	s.Table[59] = syscalls.Supported("execve", Execve)
 	s.Table[72] = syscalls.Supported("fcntl", Fcntl)
-	delete(s.Table, 73) // flock
+	s.Table[73] = syscalls.Supported("fcntl", Flock)
 	s.Table[74] = syscalls.Supported("fsync", Fsync)
 	s.Table[75] = syscalls.Supported("fdatasync", Fdatasync)
 	s.Table[76] = syscalls.Supported("truncate", Truncate)
@@ -90,8 +90,8 @@ func Override() {
 	s.Table[138] = syscalls.Supported("fstatfs", Fstatfs)
 	s.Table[161] = syscalls.Supported("chroot", Chroot)
 	s.Table[162] = syscalls.Supported("sync", Sync)
-	delete(s.Table, 165) // mount
-	delete(s.Table, 166) // umount2
+	s.Table[165] = syscalls.Supported("mount", Mount)
+	s.Table[166] = syscalls.Supported("umount2", Umount2)
 	delete(s.Table, 187) // readahead
 	s.Table[188] = syscalls.Supported("setxattr", Setxattr)
 	s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
@@ -116,9 +116,9 @@ func Override() {
 	s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
 	s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
 	s.Table[235] = syscalls.Supported("utimes", Utimes)
-	delete(s.Table, 253) // inotify_init
-	delete(s.Table, 254) // inotify_add_watch
-	delete(s.Table, 255) // inotify_rm_watch
+	s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil)
+	s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+	s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
 	s.Table[257] = syscalls.Supported("openat", Openat)
 	s.Table[258] = syscalls.Supported("mkdirat", Mkdirat)
 	s.Table[259] = syscalls.Supported("mknodat", Mknodat)
@@ -134,8 +134,8 @@ func Override() {
 	s.Table[269] = syscalls.Supported("faccessat", Faccessat)
 	s.Table[270] = syscalls.Supported("pselect", Pselect)
 	s.Table[271] = syscalls.Supported("ppoll", Ppoll)
-	delete(s.Table, 275) // splice
-	delete(s.Table, 276) // tee
+	s.Table[275] = syscalls.Supported("splice", Splice)
+	s.Table[276] = syscalls.Supported("tee", Tee)
 	s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
 	s.Table[280] = syscalls.Supported("utimensat", Utimensat)
 	s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
@@ -151,7 +151,7 @@ func Override() {
 	s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
 	s.Table[292] = syscalls.Supported("dup3", Dup3)
 	s.Table[293] = syscalls.Supported("pipe2", Pipe2)
-	delete(s.Table, 294) // inotify_init1
+	s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
 	s.Table[295] = syscalls.Supported("preadv", Preadv)
 	s.Table[296] = syscalls.Supported("pwritev", Pwritev)
 	s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg)
diff --git a/pkg/sentry/time/muldiv_arm64.s b/pkg/sentry/time/muldiv_arm64.s
index 5ad57a8a3..8afc62d53 100644
--- a/pkg/sentry/time/muldiv_arm64.s
+++ b/pkg/sentry/time/muldiv_arm64.s
@@ -12,12 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "funcdata.h"
 #include "textflag.h"
 
 // Documentation is available in parameters.go.
 //
 // func muldiv64(value, multiplier, divisor uint64) (uint64, bool)
 TEXT ·muldiv64(SB),NOSPLIT,$40-33
+    GO_ARGS
+    NO_LOCAL_POINTERS
     MOVD    value+0(FP), R0
     MOVD    multiplier+8(FP), R1
     MOVD    divisor+16(FP), R2
diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go
index e1b9084ac..0ce1257f6 100644
--- a/pkg/sentry/time/parameters_test.go
+++ b/pkg/sentry/time/parameters_test.go
@@ -484,3 +484,18 @@ func TestMulDivOverflow(t *testing.T) {
 		})
 	}
 }
+
+func BenchmarkMuldiv64(b *testing.B) {
+	var v uint64 = math.MaxUint64
+	for i := uint64(1); i <= 1000000; i++ {
+		mult := uint64(1000000000)
+		div := i * mult
+		res, ok := muldiv64(v, mult, div)
+		if !ok {
+			b.Errorf("Result of %v * %v / %v ok got false want true", v, mult, div)
+		}
+		if want := v / i; res != want {
+			b.Errorf("Bad result of %v * %v / %v: got %v, want %v", v, mult, div, res, want)
+		}
+	}
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 94d69c1cc..16d9f3a28 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -15,6 +15,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "event_list",
+    out = "event_list.go",
+    package = "vfs",
+    prefix = "event",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Event",
+        "Linker": "*Event",
+    },
+)
+
 go_library(
     name = "vfs",
     srcs = [
@@ -25,11 +37,13 @@ go_library(
         "device.go",
         "epoll.go",
         "epoll_interest_list.go",
+        "event_list.go",
         "file_description.go",
         "file_description_impl_util.go",
         "filesystem.go",
         "filesystem_impl_util.go",
         "filesystem_type.go",
+        "inotify.go",
         "mount.go",
         "mount_unsafe.go",
         "options.go",
@@ -57,6 +71,8 @@ go_library(
         "//pkg/sentry/limits",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/uniqueid",
+        "//pkg/sentry/vfs/lock",
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 9aa133bcb..66f3105bd 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -39,8 +39,8 @@ Mount references are held by:
 -   Mount: Each referenced Mount holds a reference on its parent, which is the
     mount containing its mount point.
 
--   VirtualFilesystem: A reference is held on each Mount that has not been
-    umounted.
+-   VirtualFilesystem: A reference is held on each Mount that has been connected
+    to a mount point, but not yet umounted.
 
 MountNamespace and FileDescription references are held by users of VFS. The
 expectation is that each `kernel.Task` holds a reference on its corresponding
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index caf770fd5..b7c6b60b8 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -297,3 +297,15 @@ func (d *anonDentry) TryIncRef() bool {
 func (d *anonDentry) DecRef() {
 	// no-op
 }
+
+// InotifyWithParent implements DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {}
+
+// Watches implements DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) Watches() *Watches {
+	return nil
+}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 8624dbd5d..24af13eb1 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -103,6 +103,22 @@ type DentryImpl interface {
 
 	// DecRef decrements the Dentry's reference count.
 	DecRef()
+
+	// InotifyWithParent notifies all watches on the targets represented by this
+	// dentry and its parent. The parent's watches are notified first, followed
+	// by this dentry's.
+	//
+	// InotifyWithParent automatically adds the IN_ISDIR flag for dentries
+	// representing directories.
+	//
+	// Note that the events may not actually propagate up to the user, depending
+	// on the event masks.
+	InotifyWithParent(events uint32, cookie uint32, et EventType)
+
+	// Watches returns the set of inotify watches for the file corresponding to
+	// the Dentry. Dentries that are hard links to the same underlying file
+	// share the same watches.
+	Watches() *Watches
 }
 
 // IncRef increments d's reference count.
@@ -133,6 +149,17 @@ func (d *Dentry) isMounted() bool {
 	return atomic.LoadUint32(&d.mounts) != 0
 }
 
+// InotifyWithParent notifies all watches on the inodes for this dentry and
+// its parent of events.
+func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {
+	d.impl.InotifyWithParent(events, cookie, et)
+}
+
+// Watches returns the set of inotify watches associated with d.
+func (d *Dentry) Watches() *Watches {
+	return d.impl.Watches()
+}
+
 // The following functions are exported so that filesystem implementations can
 // use them. The vfs package, and users of VFS, should not call these
 // functions.
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 8297f964b..599c3131c 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -31,6 +31,7 @@ type EpollInstance struct {
 	vfsfd FileDescription
 	FileDescriptionDefaultImpl
 	DentryMetadataFileDescriptionImpl
+	NoLockFD
 
 	// q holds waiters on this EpollInstance.
 	q waiter.Queue
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index cfabd936c..97b9b18d7 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -73,6 +73,8 @@ type FileDescription struct {
 	// writable is analogous to Linux's FMODE_WRITE.
 	writable bool
 
+	usedLockBSD uint32
+
 	// impl is the FileDescriptionImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in FileDescription.
 	impl FileDescriptionImpl
@@ -175,6 +177,12 @@ func (fd *FileDescription) DecRef() {
 			}
 			ep.interestMu.Unlock()
 		}
+
+		// If BSD locks were used, release any lock that it may have acquired.
+		if atomic.LoadUint32(&fd.usedLockBSD) != 0 {
+			fd.impl.UnlockBSD(context.Background(), fd)
+		}
+
 		// Release implementation resources.
 		fd.impl.Release()
 		if fd.writable {
@@ -210,6 +218,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry {
 	return fd.vd
 }
 
+// Options returns the options passed to fd.Init().
+func (fd *FileDescription) Options() FileDescriptionOptions {
+	return fd.opts
+}
+
 // StatusFlags returns file description status flags, as for fcntl(F_GETFL).
 func (fd *FileDescription) StatusFlags() uint32 {
 	return atomic.LoadUint32(&fd.statusFlags)
@@ -415,13 +428,9 @@ type FileDescriptionImpl interface {
 	Removexattr(ctx context.Context, name string) error
 
 	// LockBSD tries to acquire a BSD-style advisory file lock.
-	//
-	// TODO(gvisor.dev/issue/1480): BSD-style file locking
 	LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
 
-	// LockBSD releases a BSD-style advisory file lock.
-	//
-	// TODO(gvisor.dev/issue/1480): BSD-style file locking
+	// UnlockBSD releases a BSD-style advisory file lock.
 	UnlockBSD(ctx context.Context, uid lock.UniqueID) error
 
 	// LockPOSIX tries to acquire a POSIX-style advisory file lock.
@@ -731,3 +740,14 @@ func (fd *FileDescription) InodeID() uint64 {
 func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
 	return fd.Sync(ctx)
 }
+
+// LockBSD tries to acquire a BSD-style advisory file lock.
+func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error {
+	atomic.StoreUint32(&fd.usedLockBSD, 1)
+	return fd.impl.LockBSD(ctx, fd, lockType, blocker)
+}
+
+// UnlockBSD releases a BSD-style advisory file lock.
+func (fd *FileDescription) UnlockBSD(ctx context.Context) error {
+	return fd.impl.UnlockBSD(ctx, fd)
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index f4c111926..af7213dfd 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -21,8 +21,9 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/lock"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -153,26 +154,6 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string)
 	return syserror.ENOTSUP
 }
 
-// LockBSD implements FileDescriptionImpl.LockBSD.
-func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
-	return syserror.EBADF
-}
-
-// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
-func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
-	return syserror.EBADF
-}
-
-// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
-func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
-	return syserror.EBADF
-}
-
-// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
-func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
-	return syserror.EBADF
-}
-
 // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl that always represent directories to obtain
 // implementations of non-directory I/O methods that return EISDIR.
@@ -384,3 +365,60 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M
 	fd.IncRef()
 	return nil
 }
+
+// LockFD may be used by most implementations of FileDescriptionImpl.Lock*
+// functions. Caller must call Init().
+type LockFD struct {
+	locks *lock.FileLocks
+}
+
+// Init initializes fd with FileLocks to use.
+func (fd *LockFD) Init(locks *lock.FileLocks) {
+	fd.locks = locks
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	return fd.locks.LockBSD(uid, t, block)
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+	fd.locks.UnlockBSD(uid)
+	return nil
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+	return fd.locks.LockPOSIX(uid, t, rng, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error {
+	fd.locks.UnlockPOSIX(uid, rng)
+	return nil
+}
+
+// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface
+// returning ENOLCK.
+type NoLockFD struct{}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	return syserror.ENOLCK
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
+func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+	return syserror.ENOLCK
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+	return syserror.ENOLCK
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error {
+	return syserror.ENOLCK
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 3a75d4d62..5061f6ac9 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -33,6 +33,7 @@ import (
 type fileDescription struct {
 	vfsfd FileDescription
 	FileDescriptionDefaultImpl
+	NoLockFD
 }
 
 // genCount contains the number of times its DynamicBytesSource.Generate()
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
index 286510195..8882fa84a 100644
--- a/pkg/sentry/vfs/genericfstree/genericfstree.go
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -43,7 +43,7 @@ type Dentry struct {
 // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is
 // either d2's parent or an ancestor of d2's parent.
 func IsAncestorDentry(d, d2 *Dentry) bool {
-	for {
+	for d2 != nil { // Stop at root, where d2.parent == nil.
 		if d2.parent == d {
 			return true
 		}
@@ -52,6 +52,7 @@ func IsAncestorDentry(d, d2 *Dentry) bool {
 		}
 		d2 = d2.parent
 	}
+	return false
 }
 
 // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d.
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
new file mode 100644
index 000000000..7fa7d2d0c
--- /dev/null
+++ b/pkg/sentry/vfs/inotify.go
@@ -0,0 +1,698 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"bytes"
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// inotifyEventBaseSize is the base size of linux's struct inotify_event. This
+// must be a power 2 for rounding below.
+const inotifyEventBaseSize = 16
+
+// EventType defines different kinds of inotfiy events.
+//
+// The way events are labelled appears somewhat arbitrary, but they must match
+// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+type EventType uint8
+
+// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
+// FSNOTIFY_EVENT_INODE in Linux.
+const (
+	PathEvent  EventType = iota
+	InodeEvent EventType = iota
+)
+
+// Inotify represents an inotify instance created by inotify_init(2) or
+// inotify_init1(2). Inotify implements FileDescriptionImpl.
+//
+// Lock ordering:
+//   Inotify.mu -> Watches.mu -> Inotify.evMu
+//
+// +stateify savable
+type Inotify struct {
+	vfsfd FileDescription
+	FileDescriptionDefaultImpl
+	DentryMetadataFileDescriptionImpl
+	NoLockFD
+
+	// Unique identifier for this inotify instance. We don't just reuse the
+	// inotify fd because fds can be duped. These should not be exposed to the
+	// user, since we may aggressively reuse an id on S/R.
+	id uint64
+
+	// queue is used to notify interested parties when the inotify instance
+	// becomes readable or writable.
+	queue waiter.Queue `state:"nosave"`
+
+	// evMu *only* protects the events list. We need a separate lock while
+	// queuing events: using mu may violate lock ordering, since at that point
+	// the calling goroutine may already hold Watches.mu.
+	evMu sync.Mutex `state:"nosave"`
+
+	// A list of pending events for this inotify instance. Protected by evMu.
+	events eventList
+
+	// A scratch buffer, used to serialize inotify events. Allocate this
+	// ahead of time for the sake of performance. Protected by evMu.
+	scratch []byte
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// nextWatchMinusOne is used to allocate watch descriptors on this Inotify
+	// instance. Note that Linux starts numbering watch descriptors from 1.
+	nextWatchMinusOne int32
+
+	// Map from watch descriptors to watch objects.
+	watches map[int32]*Watch
+}
+
+var _ FileDescriptionImpl = (*Inotify)(nil)
+
+// NewInotifyFD constructs a new Inotify instance.
+func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) {
+	// O_CLOEXEC affects file descriptors, so it must be handled outside of vfs.
+	flags &^= linux.O_CLOEXEC
+	if flags&^linux.O_NONBLOCK != 0 {
+		return nil, syserror.EINVAL
+	}
+
+	id := uniqueid.GlobalFromContext(ctx)
+	vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id))
+	defer vd.DecRef()
+	fd := &Inotify{
+		id:      id,
+		scratch: make([]byte, inotifyEventBaseSize),
+		watches: make(map[int32]*Watch),
+	}
+	if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release. Release removes all
+// watches and frees all resources for an inotify instance.
+func (i *Inotify) Release() {
+	// We need to hold i.mu to avoid a race with concurrent calls to
+	// Inotify.handleDeletion from Watches. There's no risk of Watches
+	// accessing this Inotify after the destructor ends, because we remove all
+	// references to it below.
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	for _, w := range i.watches {
+		// Remove references to the watch from the watches set on the target. We
+		// don't need to worry about the references from i.watches, since this
+		// file description is about to be destroyed.
+		w.set.Remove(i.id)
+	}
+}
+
+// EventRegister implements waiter.Waitable.
+func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	i.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.
+func (i *Inotify) EventUnregister(e *waiter.Entry) {
+	i.queue.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// Readiness indicates whether there are pending events for an inotify instance.
+func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
+	ready := waiter.EventMask(0)
+
+	i.evMu.Lock()
+	defer i.evMu.Unlock()
+
+	if !i.events.Empty() {
+		ready |= waiter.EventIn
+	}
+
+	return mask & ready
+}
+
+// PRead implements FileDescriptionImpl.
+func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// PWrite implements FileDescriptionImpl.
+func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	if dst.NumBytes() < inotifyEventBaseSize {
+		return 0, syserror.EINVAL
+	}
+
+	i.evMu.Lock()
+	defer i.evMu.Unlock()
+
+	if i.events.Empty() {
+		// Nothing to read yet, tell caller to block.
+		return 0, syserror.ErrWouldBlock
+	}
+
+	var writeLen int64
+	for it := i.events.Front(); it != nil; {
+		// Advance `it` before the element is removed from the list, or else
+		// it.Next() will always be nil.
+		event := it
+		it = it.Next()
+
+		// Does the buffer have enough remaining space to hold the event we're
+		// about to write out?
+		if dst.NumBytes() < int64(event.sizeOf()) {
+			if writeLen > 0 {
+				// Buffer wasn't big enough for all pending events, but we did
+				// write some events out.
+				return writeLen, nil
+			}
+			return 0, syserror.EINVAL
+		}
+
+		// Linux always dequeues an available event as long as there's enough
+		// buffer space to copy it out, even if the copy below fails. Emulate
+		// this behaviour.
+		i.events.Remove(event)
+
+		// Buffer has enough space, copy event to the read buffer.
+		n, err := event.CopyTo(ctx, i.scratch, dst)
+		if err != nil {
+			return 0, err
+		}
+
+		writeLen += n
+		dst = dst.DropFirst64(n)
+	}
+	return writeLen, nil
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch args[1].Int() {
+	case linux.FIONREAD:
+		i.evMu.Lock()
+		defer i.evMu.Unlock()
+		var n uint32
+		for e := i.events.Front(); e != nil; e = e.Next() {
+			n += uint32(e.sizeOf())
+		}
+		var buf [4]byte
+		usermem.ByteOrder.PutUint32(buf[:], n)
+		_, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+func (i *Inotify) queueEvent(ev *Event) {
+	i.evMu.Lock()
+
+	// Check if we should coalesce the event we're about to queue with the last
+	// one currently in the queue. Events are coalesced if they are identical.
+	if last := i.events.Back(); last != nil {
+		if ev.equals(last) {
+			// "Coalesce" the two events by simply not queuing the new one. We
+			// don't need to raise a waiter.EventIn notification because no new
+			// data is available for reading.
+			i.evMu.Unlock()
+			return
+		}
+	}
+
+	i.events.PushBack(ev)
+
+	// Release mutex before notifying waiters because we don't control what they
+	// can do.
+	i.evMu.Unlock()
+
+	i.queue.Notify(waiter.EventIn)
+}
+
+// newWatchLocked creates and adds a new watch to target.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch {
+	targetWatches := target.Watches()
+	w := &Watch{
+		owner: i,
+		wd:    i.nextWatchIDLocked(),
+		set:   targetWatches,
+		mask:  mask,
+	}
+
+	// Hold the watch in this inotify instance as well as the watch set on the
+	// target.
+	i.watches[w.wd] = w
+	targetWatches.Add(w)
+	return w
+}
+
+// newWatchIDLocked allocates and returns a new watch descriptor.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) nextWatchIDLocked() int32 {
+	i.nextWatchMinusOne++
+	return i.nextWatchMinusOne
+}
+
+// handleDeletion handles the deletion of the target of watch w. It removes w
+// from i.watches and a watch removal event is generated.
+func (i *Inotify) handleDeletion(w *Watch) {
+	i.mu.Lock()
+	_, found := i.watches[w.wd]
+	delete(i.watches, w.wd)
+	i.mu.Unlock()
+
+	if found {
+		i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0))
+	}
+}
+
+// AddWatch constructs a new inotify watch and adds it to the target. It
+// returns the watch descriptor returned by inotify_add_watch(2).
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
+	// Note: Locking this inotify instance protects the result returned by
+	// Lookup() below. With the lock held, we know for sure the lookup result
+	// won't become stale because it's impossible for *this* instance to
+	// add/remove watches on target.
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	// Does the target already have a watch from this inotify instance?
+	if existing := target.Watches().Lookup(i.id); existing != nil {
+		newmask := mask
+		if mask&linux.IN_MASK_ADD != 0 {
+			// "Add (OR) events to watch mask for this pathname if it already
+			// exists (instead of replacing mask)." -- inotify(7)
+			newmask |= atomic.LoadUint32(&existing.mask)
+		}
+		atomic.StoreUint32(&existing.mask, newmask)
+		return existing.wd
+	}
+
+	// No existing watch, create a new watch.
+	w := i.newWatchLocked(target, mask)
+	return w.wd
+}
+
+// RmWatch looks up an inotify watch for the given 'wd' and configures the
+// target to stop sending events to this inotify instance.
+func (i *Inotify) RmWatch(wd int32) error {
+	i.mu.Lock()
+
+	// Find the watch we were asked to removed.
+	w, ok := i.watches[wd]
+	if !ok {
+		i.mu.Unlock()
+		return syserror.EINVAL
+	}
+
+	// Remove the watch from this instance.
+	delete(i.watches, wd)
+
+	// Remove the watch from the watch target.
+	w.set.Remove(w.OwnerID())
+	i.mu.Unlock()
+
+	// Generate the event for the removal.
+	i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0))
+
+	return nil
+}
+
+// Watches is the collection of all inotify watches on a single file.
+//
+// +stateify savable
+type Watches struct {
+	// mu protects the fields below.
+	mu sync.RWMutex `state:"nosave"`
+
+	// ws is the map of active watches in this collection, keyed by the inotify
+	// instance id of the owner.
+	ws map[uint64]*Watch
+}
+
+// Lookup returns the watch owned by an inotify instance with the given id.
+// Returns nil if no such watch exists.
+//
+// Precondition: the inotify instance with the given id must be locked to
+// prevent the returned watch from being concurrently modified or replaced in
+// Inotify.watches.
+func (w *Watches) Lookup(id uint64) *Watch {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	return w.ws[id]
+}
+
+// Add adds watch into this set of watches.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Add(watch *Watch) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	owner := watch.OwnerID()
+	// Sanity check, we should never have two watches for one owner on the
+	// same target.
+	if _, exists := w.ws[owner]; exists {
+		panic(fmt.Sprintf("Watch collision with ID %+v", owner))
+	}
+	if w.ws == nil {
+		w.ws = make(map[uint64]*Watch)
+	}
+	w.ws[owner] = watch
+}
+
+// Remove removes a watch with the given id from this set of watches and
+// releases it. The caller is responsible for generating any watch removal
+// event, as appropriate. The provided id must match an existing watch in this
+// collection.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Remove(id uint64) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	if w.ws == nil {
+		// This watch set is being destroyed. The thread executing the
+		// destructor is already in the process of deleting all our watches. We
+		// got here with no references on the target because we raced with the
+		// destructor notifying all the watch owners of destruction. See the
+		// comment in Watches.HandleDeletion for why this race exists.
+		return
+	}
+
+	if _, ok := w.ws[id]; !ok {
+		// While there's technically no problem with silently ignoring a missing
+		// watch, this is almost certainly a bug.
+		panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id))
+	}
+	delete(w.ws, id)
+}
+
+// Notify queues a new event with all watches in this set.
+func (w *Watches) Notify(name string, events, cookie uint32, et EventType) {
+	w.NotifyWithExclusions(name, events, cookie, et, false)
+}
+
+// NotifyWithExclusions queues a new event with watches in this set. Watches
+// with IN_EXCL_UNLINK are skipped if the event is coming from a child that
+// has been unlinked.
+func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) {
+	// N.B. We don't defer the unlocks because Notify is in the hot path of
+	// all IO operations, and the defer costs too much for small IO
+	// operations.
+	w.mu.RLock()
+	for _, watch := range w.ws {
+		if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent {
+			continue
+		}
+		watch.Notify(name, events, cookie)
+	}
+	w.mu.RUnlock()
+}
+
+// HandleDeletion is called when the watch target is destroyed to emit
+// the appropriate events.
+func (w *Watches) HandleDeletion() {
+	w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent)
+
+	// TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied
+	// by value. Ideally, we wouldn't have this circular locking so we can just
+	// notify of IN_DELETE_SELF in the same loop below.
+	//
+	// We can't hold w.mu while calling watch.handleDeletion to preserve lock
+	// ordering w.r.t to the owner inotify instances. Instead, atomically move
+	// the watches map into a local variable so we can iterate over it safely.
+	//
+	// Because of this however, it is possible for the watches' owners to reach
+	// this inode while the inode has no refs. This is still safe because the
+	// owners can only reach the inode until this function finishes calling
+	// watch.handleDeletion below and the inode is guaranteed to exist in the
+	// meantime. But we still have to be very careful not to rely on inode state
+	// that may have been already destroyed.
+	var ws map[uint64]*Watch
+	w.mu.Lock()
+	ws = w.ws
+	w.ws = nil
+	w.mu.Unlock()
+
+	for _, watch := range ws {
+		// TODO(gvisor.dev/issue/1479): consider refactoring this.
+		watch.handleDeletion()
+	}
+}
+
+// Watch represent a particular inotify watch created by inotify_add_watch.
+//
+// +stateify savable
+type Watch struct {
+	// Inotify instance which owns this watch.
+	owner *Inotify
+
+	// Descriptor for this watch. This is unique across an inotify instance.
+	wd int32
+
+	// set is the watch set containing this watch. It belongs to the target file
+	// of this watch.
+	set *Watches
+
+	// Events being monitored via this watch. Must be accessed with atomic
+	// memory operations.
+	mask uint32
+}
+
+// OwnerID returns the id of the inotify instance that owns this watch.
+func (w *Watch) OwnerID() uint64 {
+	return w.owner.id
+}
+
+// ExcludeUnlinkedChildren indicates whether the watched object should continue
+// to be notified of events of its children after they have been unlinked, e.g.
+// for an open file descriptor.
+//
+// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK.
+// We can do this by keeping track of the set of unlinked children in Watches
+// to skip notification.
+func (w *Watch) ExcludeUnlinkedChildren() bool {
+	return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0
+}
+
+// Notify queues a new event on this watch.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) {
+	mask := atomic.LoadUint32(&w.mask)
+	if mask&events == 0 {
+		// We weren't watching for this event.
+		return
+	}
+
+	// Event mask should include bits matched from the watch plus all control
+	// event bits.
+	unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS
+	effectiveMask := unmaskableBits | mask
+	matchedEvents := effectiveMask & events
+	w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie))
+}
+
+// handleDeletion handles the deletion of w's target.
+func (w *Watch) handleDeletion() {
+	w.owner.handleDeletion(w)
+}
+
+// Event represents a struct inotify_event from linux.
+//
+// +stateify savable
+type Event struct {
+	eventEntry
+
+	wd     int32
+	mask   uint32
+	cookie uint32
+
+	// len is computed based on the name field is set automatically by
+	// Event.setName. It should be 0 when no name is set; otherwise it is the
+	// length of the name slice.
+	len uint32
+
+	// The name field has special padding requirements and should only be set by
+	// calling Event.setName.
+	name []byte
+}
+
+func newEvent(wd int32, name string, events, cookie uint32) *Event {
+	e := &Event{
+		wd:     wd,
+		mask:   events,
+		cookie: cookie,
+	}
+	if name != "" {
+		e.setName(name)
+	}
+	return e
+}
+
+// paddedBytes converts a go string to a null-terminated c-string, padded with
+// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes
+// in the 's' plus at least one null byte.
+func paddedBytes(s string, l uint32) []byte {
+	if l < uint32(len(s)+1) {
+		panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
+	}
+	b := make([]byte, l)
+	copy(b, s)
+
+	// b was zero-value initialized during make(), so the rest of the slice is
+	// already filled with null bytes.
+
+	return b
+}
+
+// setName sets the optional name for this event.
+func (e *Event) setName(name string) {
+	// We need to pad the name such that the entire event length ends up a
+	// multiple of inotifyEventBaseSize.
+	unpaddedLen := len(name) + 1
+	// Round up to nearest multiple of inotifyEventBaseSize.
+	e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1))
+	// Make sure we haven't overflowed and wrapped around when rounding.
+	if unpaddedLen > int(e.len) {
+		panic("Overflow when rounding inotify event size, the 'name' field was too big.")
+	}
+	e.name = paddedBytes(name, e.len)
+}
+
+func (e *Event) sizeOf() int {
+	s := inotifyEventBaseSize + int(e.len)
+	if s < inotifyEventBaseSize {
+		panic("overflow")
+	}
+	return s
+}
+
+// CopyTo serializes this event to dst. buf is used as a scratch buffer to
+// construct the output. We use a buffer allocated ahead of time for
+// performance. buf must be at least inotifyEventBaseSize bytes.
+func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) {
+	usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd))
+	usermem.ByteOrder.PutUint32(buf[4:], e.mask)
+	usermem.ByteOrder.PutUint32(buf[8:], e.cookie)
+	usermem.ByteOrder.PutUint32(buf[12:], e.len)
+
+	writeLen := 0
+
+	n, err := dst.CopyOut(ctx, buf)
+	if err != nil {
+		return 0, err
+	}
+	writeLen += n
+	dst = dst.DropFirst(n)
+
+	if e.len > 0 {
+		n, err = dst.CopyOut(ctx, e.name)
+		if err != nil {
+			return 0, err
+		}
+		writeLen += n
+	}
+
+	// Santiy check.
+	if writeLen != e.sizeOf() {
+		panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen))
+	}
+
+	return int64(writeLen), nil
+}
+
+func (e *Event) equals(other *Event) bool {
+	return e.wd == other.wd &&
+		e.mask == other.mask &&
+		e.cookie == other.cookie &&
+		e.len == other.len &&
+		bytes.Equal(e.name, other.name)
+}
+
+// InotifyEventFromStatMask generates the appropriate events for an operation
+// that set the stats specified in mask.
+func InotifyEventFromStatMask(mask uint32) uint32 {
+	var ev uint32
+	if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 {
+		ev |= linux.IN_ATTRIB
+	}
+	if mask&linux.STATX_SIZE != 0 {
+		ev |= linux.IN_MODIFY
+	}
+
+	if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) {
+		// Both times indicates a utime(s) call.
+		ev |= linux.IN_ATTRIB
+	} else if mask&linux.STATX_ATIME != 0 {
+		ev |= linux.IN_ACCESS
+	} else if mask&linux.STATX_MTIME != 0 {
+		mask |= linux.IN_MODIFY
+	}
+	return ev
+}
+
+// InotifyRemoveChild sends the appriopriate notifications to the watch sets of
+// the child being removed and its parent.
+func InotifyRemoveChild(self, parent *Watches, name string) {
+	self.Notify("", linux.IN_ATTRIB, 0, InodeEvent)
+	parent.Notify(name, linux.IN_DELETE, 0, InodeEvent)
+	// TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK.
+}
+
+// InotifyRename sends the appriopriate notifications to the watch sets of the
+// file being renamed and its old/new parents.
+func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) {
+	var dirEv uint32
+	if isDir {
+		dirEv = linux.IN_ISDIR
+	}
+	cookie := uniqueid.InotifyCookie(ctx)
+	oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent)
+	newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent)
+	// Somewhat surprisingly, self move events do not have a cookie.
+	renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent)
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 02850b65c..32f901bd8 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -28,9 +28,6 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// lastMountID is used to allocate mount ids. Must be accessed atomically.
-var lastMountID uint64
-
 // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
 // (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
 // (Mount.fs), which applies to path resolution in the context of a particular
@@ -58,6 +55,10 @@ type Mount struct {
 	// ID is the immutable mount ID.
 	ID uint64
 
+	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
+	// for MS_RDONLY which is tracked in "writers". Immutable.
+	Flags MountFlags
+
 	// key is protected by VirtualFilesystem.mountMu and
 	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
 	// key.parent and key.point if they are not nil.
@@ -84,10 +85,6 @@ type Mount struct {
 	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
 	umounted bool
 
-	// flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
-	// for MS_RDONLY which is tracked in "writers".
-	flags MountFlags
-
 	// The lower 63 bits of writers is the number of calls to
 	// Mount.CheckBeginWrite() that have not yet been paired with a call to
 	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
@@ -97,11 +94,11 @@ type Mount struct {
 
 func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
 	mnt := &Mount{
-		ID:    atomic.AddUint64(&lastMountID, 1),
+		ID:    atomic.AddUint64(&vfs.lastMountID, 1),
+		Flags: opts.Flags,
 		vfs:   vfs,
 		fs:    fs,
 		root:  root,
-		flags: opts.Flags,
 		ns:    mntns,
 		refs:  1,
 	}
@@ -111,8 +108,17 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount
 	return mnt
 }
 
-// A MountNamespace is a collection of Mounts.
-//
+// Options returns a copy of the MountOptions currently applicable to mnt.
+func (mnt *Mount) Options() MountOptions {
+	mnt.vfs.mountMu.Lock()
+	defer mnt.vfs.mountMu.Unlock()
+	return MountOptions{
+		Flags:    mnt.Flags,
+		ReadOnly: mnt.readOnly(),
+	}
+}
+
+// A MountNamespace is a collection of Mounts.//
 // MountNamespaces are reference-counted. Unless otherwise specified, all
 // MountNamespace methods require that a reference is held.
 //
@@ -120,6 +126,9 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount
 //
 // +stateify savable
 type MountNamespace struct {
+	// Owner is the usernamespace that owns this mount namespace.
+	Owner *auth.UserNamespace
+
 	// root is the MountNamespace's root mount. root is immutable.
 	root *Mount
 
@@ -148,7 +157,7 @@ type MountNamespace struct {
 func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
-		ctx.Warningf("Unknown filesystem: %s", fsTypeName)
+		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
 		return nil, syserror.ENODEV
 	}
 	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
@@ -156,6 +165,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 		return nil, err
 	}
 	mntns := &MountNamespace{
+		Owner:       creds.UserNamespace,
 		refs:        1,
 		mountpoints: make(map[*Dentry]uint32),
 	}
@@ -175,26 +185,34 @@ func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry,
 	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
 }
 
-// MountAt creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+// MountDisconnected creates a Filesystem configured by the given arguments,
+// then returns a Mount representing it. The new Mount is not associated with
+// any MountNamespace and is not connected to any other Mounts.
+func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
 	rft := vfs.getFilesystemType(fsTypeName)
 	if rft == nil {
-		return syserror.ENODEV
+		return nil, syserror.ENODEV
 	}
 	if !opts.InternalMount && !rft.opts.AllowUserMount {
-		return syserror.ENODEV
+		return nil, syserror.ENODEV
 	}
 	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
-		return err
+		return nil, err
 	}
+	defer root.DecRef()
+	defer fs.DecRef()
+	return vfs.NewDisconnectedMount(fs, root, opts)
+}
 
+// ConnectMountAt connects mnt at the path represented by target.
+//
+// Preconditions: mnt must be disconnected.
+func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
 	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
 	// lock ordering.
 	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
 	if err != nil {
-		root.DecRef()
-		fs.DecRef()
 		return err
 	}
 	vfs.mountMu.Lock()
@@ -204,8 +222,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 			vd.dentry.mu.Unlock()
 			vfs.mountMu.Unlock()
 			vd.DecRef()
-			root.DecRef()
-			fs.DecRef()
 			return syserror.ENOENT
 		}
 		// vd might have been mounted over between vfs.GetDentryAt() and
@@ -238,7 +254,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 	// point and the mount root are directories, or neither are, and returns
 	// ENOTDIR if this is not the case.
 	mntns := vd.mount.ns
-	mnt := newMount(vfs, fs, root, mntns, opts)
 	vfs.mounts.seq.BeginWrite()
 	vfs.connectLocked(mnt, vd, mntns)
 	vfs.mounts.seq.EndWrite()
@@ -247,6 +262,19 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
 	return nil
 }
 
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
+	if err != nil {
+		return err
+	}
+	defer mnt.DecRef()
+	if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
+		return err
+	}
+	return nil
+}
+
 // UmountAt removes the Mount at the given path.
 func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
 	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
@@ -254,6 +282,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
 	}
 
 	// MNT_FORCE is currently unimplemented except for the permission check.
+	// Force unmounting specifically requires CAP_SYS_ADMIN in the root user
+	// namespace, and not in the owner user namespace for the target mount. See
+	// fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
 	if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
 		return syserror.EPERM
 	}
@@ -369,14 +400,22 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu
 // references held by vd.
 //
 // Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. d.mu must be locked. mnt.parent() == nil.
+// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt
+// must not already be connected.
 func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
+	if checkInvariants {
+		if mnt.parent() != nil {
+			panic("VFS.connectLocked called on connected mount")
+		}
+	}
+	mnt.IncRef() // dropped by callers of umountRecursiveLocked
 	mnt.storeKey(vd)
 	if vd.mount.children == nil {
 		vd.mount.children = make(map[*Mount]struct{})
 	}
 	vd.mount.children[mnt] = struct{}{}
 	atomic.AddUint32(&vd.dentry.mounts, 1)
+	mnt.ns = mntns
 	mntns.mountpoints[vd.dentry]++
 	vfs.mounts.insertSeqed(mnt)
 	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
@@ -394,6 +433,11 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
 // writer critical section. mnt.parent() != nil.
 func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
 	vd := mnt.loadKey()
+	if checkInvariants {
+		if vd.mount != nil {
+			panic("VFS.disconnectLocked called on disconnected mount")
+		}
+	}
 	mnt.storeKey(VirtualDentry{})
 	delete(vd.mount.children, mnt)
 	atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
@@ -715,7 +759,10 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi
 		if mnt.readOnly() {
 			opts = "ro"
 		}
-		if mnt.flags.NoExec {
+		if mnt.Flags.NoATime {
+			opts = ",noatime"
+		}
+		if mnt.Flags.NoExec {
 			opts += ",noexec"
 		}
 
@@ -800,11 +847,12 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo
 		if mnt.readOnly() {
 			opts = "ro"
 		}
-		if mnt.flags.NoExec {
+		if mnt.Flags.NoATime {
+			opts = ",noatime"
+		}
+		if mnt.Flags.NoExec {
 			opts += ",noexec"
 		}
-		// TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is
-		// set.
 		fmt.Fprintf(buf, "%s ", opts)
 
 		// (7) Optional fields: zero or more fields of the form "tag[:value]".
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index bc7581698..70f850ca4 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 53d364c5c..f223aeda8 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -75,6 +75,10 @@ type MknodOptions struct {
 type MountFlags struct {
 	// NoExec is equivalent to MS_NOEXEC.
 	NoExec bool
+
+	// NoATime is equivalent to MS_NOATIME and indicates that the
+	// filesystem should not update access time in-place.
+	NoATime bool
 }
 
 // MountOptions contains options to VirtualFilesystem.MountAt().
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 8d7f8f8af..9acca8bc7 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -82,6 +82,10 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// lastMountID is the last allocated mount ID. lastMountID is accessed
+	// using atomic memory operations.
+	lastMountID uint64
+
 	// anonMount is a Mount, not included in mounts or mountpoints,
 	// representing an anonFilesystem. anonMount is used to back
 	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
@@ -401,7 +405,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 			vfs.putResolvingPath(rp)
 
 			if opts.FileExec {
-				if fd.Mount().flags.NoExec {
+				if fd.Mount().Flags.NoExec {
 					fd.DecRef()
 					return nil, syserror.EACCES
 				}
@@ -418,6 +422,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 				}
 			}
 
+			fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent)
 			return fd, nil
 		}
 		if !rp.handleError(err) {
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index 101497ed6..748273366 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -77,7 +77,10 @@ var DefaultOpts = Opts{
 // trigger it.
 const descheduleThreshold = 1 * time.Second
 
-var stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+var (
+	stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout")
+	stuckTasks   = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+)
 
 // Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck.
 var stackDumpSameTaskPeriod = time.Minute
@@ -220,6 +223,9 @@ func (w *Watchdog) waitForStart() {
 		// We are fine.
 		return
 	}
+
+	stuckStartup.Increment()
+
 	var buf bytes.Buffer
 	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
 	w.doAction(w.StartupTimeoutAction, false, &buf)
@@ -323,13 +329,13 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo
 
 func (w *Watchdog) reportStuckWatchdog() {
 	var buf bytes.Buffer
-	buf.WriteString("Watchdog goroutine is stuck:")
+	buf.WriteString("Watchdog goroutine is stuck")
 	w.doAction(w.TaskTimeoutAction, false, &buf)
 }
 
 // doAction will take the given action. If the action is LogWarning, the stack
-// is not always dumpped to the log to prevent log flooding. "forceStack"
-// guarantees that the stack will be dumped regarless.
+// is not always dumped to the log to prevent log flooding. "forceStack"
+// guarantees that the stack will be dumped regardless.
 func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
 	switch action {
 	case LogWarning:
diff --git a/pkg/sleep/sleep_unsafe.go b/pkg/sleep/sleep_unsafe.go
index 65bfcf778..f68c12620 100644
--- a/pkg/sleep/sleep_unsafe.go
+++ b/pkg/sleep/sleep_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.11
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/state/BUILD b/pkg/state/BUILD
index 921af9d63..2b1350135 100644
--- a/pkg/state/BUILD
+++ b/pkg/state/BUILD
@@ -47,6 +47,7 @@ go_library(
         "state.go",
         "stats.go",
     ],
+    marshal = False,
     stateify = False,
     visibility = ["//:sandbox"],
     deps = [
diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD
index 0e35d7d17..d0d77e19c 100644
--- a/pkg/sync/BUILD
+++ b/pkg/sync/BUILD
@@ -39,6 +39,8 @@ go_library(
         "seqcount.go",
         "sync.go",
     ],
+    marshal = False,
+    stateify = False,
 )
 
 go_test(
diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go
index ad4a3a37e..1d7780695 100644
--- a/pkg/sync/memmove_unsafe.go
+++ b/pkg/sync/memmove_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/sync/mutex_unsafe.go b/pkg/sync/mutex_unsafe.go
index 3dd15578b..dc034d561 100644
--- a/pkg/sync/mutex_unsafe.go
+++ b/pkg/sync/mutex_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.15
+// +build !go1.16
 
 // When updating the build constraint (above), check that syncMutex matches the
 // standard library sync.Mutex definition.
diff --git a/pkg/sync/rwmutex_unsafe.go b/pkg/sync/rwmutex_unsafe.go
index ea6cdc447..995c0346e 100644
--- a/pkg/sync/rwmutex_unsafe.go
+++ b/pkg/sync/rwmutex_unsafe.go
@@ -4,7 +4,7 @@
 // license that can be found in the LICENSE file.
 
 // +build go1.13
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/syncevent/waiter_unsafe.go b/pkg/syncevent/waiter_unsafe.go
index 112e0e604..ad271e1a0 100644
--- a/pkg/syncevent/waiter_unsafe.go
+++ b/pkg/syncevent/waiter_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.11
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD
index e57d45f2a..a984f1712 100644
--- a/pkg/tcpip/adapters/gonet/BUILD
+++ b/pkg/tcpip/adapters/gonet/BUILD
@@ -22,7 +22,6 @@ go_test(
     size = "small",
     srcs = ["gonet_test.go"],
     library = ":gonet",
-    tags = ["flaky"],
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/header",
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index 6e0db2741..d82ed5205 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -335,6 +335,11 @@ func (c *TCPConn) Read(b []byte) (int, error) {
 	deadline := c.readCancel()
 
 	numRead := 0
+	defer func() {
+		if numRead != 0 {
+			c.ep.ModerateRecvBuf(numRead)
+		}
+	}()
 	for numRead != len(b) {
 		if len(c.read) == 0 {
 			var err error
diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go
index 76839eb92..62ac932bb 100644
--- a/pkg/tcpip/header/ipv4.go
+++ b/pkg/tcpip/header/ipv4.go
@@ -159,6 +159,11 @@ func (b IPv4) Flags() uint8 {
 	return uint8(binary.BigEndian.Uint16(b[flagsFO:]) >> 13)
 }
 
+// More returns whether the more fragments flag is set.
+func (b IPv4) More() bool {
+	return b.Flags()&IPv4FlagMoreFragments != 0
+}
+
 // TTL returns the "TTL" field of the ipv4 header.
 func (b IPv4) TTL() uint8 {
 	return b[ttl]
diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go
index 2c4591409..3499d8399 100644
--- a/pkg/tcpip/header/ipv6_extension_headers.go
+++ b/pkg/tcpip/header/ipv6_extension_headers.go
@@ -354,6 +354,13 @@ func (b IPv6FragmentExtHdr) ID() uint32 {
 	return binary.BigEndian.Uint32(b[ipv6FragmentExtHdrIdentificationOffset:])
 }
 
+// IsAtomic returns whether the fragment header indicates an atomic fragment. An
+// atomic fragment is a fragment that contains all the data required to
+// reassemble a full packet.
+func (b IPv6FragmentExtHdr) IsAtomic() bool {
+	return !b.More() && b.FragmentOffset() == 0
+}
+
 // IPv6PayloadIterator is an iterator over the contents of an IPv6 payload.
 //
 // The IPv6 payload may contain IPv6 extension headers before any upper layer
diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go
index 29454c4b9..4c6f808e5 100644
--- a/pkg/tcpip/header/tcp.go
+++ b/pkg/tcpip/header/tcp.go
@@ -66,6 +66,14 @@ const (
 	TCPOptionSACK          = 5
 )
 
+// Option Lengths.
+const (
+	TCPOptionMSSLength           = 4
+	TCPOptionTSLength            = 10
+	TCPOptionWSLength            = 3
+	TCPOptionSackPermittedLength = 2
+)
+
 // TCPFields contains the fields of a TCP packet. It is used to describe the
 // fields of a packet that needs to be encoded.
 type TCPFields struct {
@@ -494,14 +502,11 @@ func ParseTCPOptions(b []byte) TCPOptions {
 // returns without encoding anything. It returns the number of bytes written to
 // the provided buffer.
 func EncodeMSSOption(mss uint32, b []byte) int {
-	// mssOptionSize is the number of bytes in a valid MSS option.
-	const mssOptionSize = 4
-
-	if len(b) < mssOptionSize {
+	if len(b) < TCPOptionMSSLength {
 		return 0
 	}
-	b[0], b[1], b[2], b[3] = TCPOptionMSS, mssOptionSize, byte(mss>>8), byte(mss)
-	return mssOptionSize
+	b[0], b[1], b[2], b[3] = TCPOptionMSS, TCPOptionMSSLength, byte(mss>>8), byte(mss)
+	return TCPOptionMSSLength
 }
 
 // EncodeWSOption encodes the WS TCP option with the WS value in the
@@ -509,10 +514,10 @@ func EncodeMSSOption(mss uint32, b []byte) int {
 // returns without encoding anything. It returns the number of bytes written to
 // the provided buffer.
 func EncodeWSOption(ws int, b []byte) int {
-	if len(b) < 3 {
+	if len(b) < TCPOptionWSLength {
 		return 0
 	}
-	b[0], b[1], b[2] = TCPOptionWS, 3, uint8(ws)
+	b[0], b[1], b[2] = TCPOptionWS, TCPOptionWSLength, uint8(ws)
 	return int(b[1])
 }
 
@@ -521,10 +526,10 @@ func EncodeWSOption(ws int, b []byte) int {
 // just returns without encoding anything. It returns the number of bytes
 // written to the provided buffer.
 func EncodeTSOption(tsVal, tsEcr uint32, b []byte) int {
-	if len(b) < 10 {
+	if len(b) < TCPOptionTSLength {
 		return 0
 	}
-	b[0], b[1] = TCPOptionTS, 10
+	b[0], b[1] = TCPOptionTS, TCPOptionTSLength
 	binary.BigEndian.PutUint32(b[2:], tsVal)
 	binary.BigEndian.PutUint32(b[6:], tsEcr)
 	return int(b[1])
@@ -535,11 +540,11 @@ func EncodeTSOption(tsVal, tsEcr uint32, b []byte) int {
 // encoding anything. It returns the number of bytes written to the provided
 // buffer.
 func EncodeSACKPermittedOption(b []byte) int {
-	if len(b) < 2 {
+	if len(b) < TCPOptionSackPermittedLength {
 		return 0
 	}
 
-	b[0], b[1] = TCPOptionSACKPermitted, 2
+	b[0], b[1] = TCPOptionSACKPermitted, TCPOptionSackPermittedLength
 	return int(b[1])
 }
 
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
index 9bf67686d..20b183da0 100644
--- a/pkg/tcpip/link/channel/channel.go
+++ b/pkg/tcpip/link/channel/channel.go
@@ -181,13 +181,13 @@ func (e *Endpoint) NumQueued() int {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
 }
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt stack.PacketBuffer) {
-	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
+func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *stack.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
 }
 
 // Attach saves the stack network-layer dispatcher for use later when packets
@@ -229,13 +229,13 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	// Clone r then release its resource so we only get the relevant fields from
 	// stack.Route without holding a reference to a NIC's endpoint.
 	route := r.Clone()
 	route.Release()
 	p := PacketInfo{
-		Pkt:   &pkt,
+		Pkt:   pkt,
 		Proto: protocol,
 		GSO:   gso,
 		Route: route,
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index affa1bbdf..f34082e1a 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -387,7 +387,7 @@ const (
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	if e.hdrSize > 0 {
 		// Add ethernet header if needed.
 		eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
@@ -641,8 +641,8 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, pkt)
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // NewInjectable creates a new fd-based InjectableEndpoint.
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index 3bfb15a8e..eaee7e5d7 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -45,7 +45,7 @@ const (
 type packetInfo struct {
 	raddr    tcpip.LinkAddress
 	proto    tcpip.NetworkProtocolNumber
-	contents stack.PacketBuffer
+	contents *stack.PacketBuffer
 }
 
 type context struct {
@@ -103,7 +103,7 @@ func (c *context) cleanup() {
 	}
 }
 
-func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (c *context) DeliverNetworkPacket(remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	c.ch <- packetInfo{remote, protocol, pkt}
 }
 
@@ -179,7 +179,7 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash u
 			L3HdrLen:   header.IPv4MaximumHeaderSize,
 		}
 	}
-	if err := c.ep.WritePacket(r, gso, proto, stack.PacketBuffer{
+	if err := c.ep.WritePacket(r, gso, proto, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 		Hash:   hash,
@@ -295,7 +295,7 @@ func TestPreserveSrcAddress(t *testing.T) {
 	// WritePacket panics given a prependable with anything less than
 	// the minimum size of the ethernet header.
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
-	if err := c.ep.WritePacket(r, nil /* gso */, proto, stack.PacketBuffer{
+	if err := c.ep.WritePacket(r, nil /* gso */, proto, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.VectorisedView{},
 	}); err != nil {
@@ -358,7 +358,7 @@ func TestDeliverPacket(t *testing.T) {
 					want := packetInfo{
 						raddr: raddr,
 						proto: proto,
-						contents: stack.PacketBuffer{
+						contents: &stack.PacketBuffer{
 							Data:       buffer.View(b).ToVectorisedView(),
 							LinkHeader: buffer.View(hdr),
 						},
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
index fe2bf3b0b..2dfd29aa9 100644
--- a/pkg/tcpip/link/fdbased/mmap.go
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -191,7 +191,7 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	pkt = pkt[d.e.hdrSize:]
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, stack.PacketBuffer{
+	d.e.dispatcher.DeliverNetworkPacket(remote, local, p, &stack.PacketBuffer{
 		Data:       buffer.View(pkt).ToVectorisedView(),
 		LinkHeader: buffer.View(eth),
 	})
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
index cb4cbea69..f04738cfb 100644
--- a/pkg/tcpip/link/fdbased/packet_dispatchers.go
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -139,13 +139,13 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
 	}
 
 	used := d.capViews(n, BufConfig)
-	pkt := stack.PacketBuffer{
+	pkt := &stack.PacketBuffer{
 		Data:       buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)),
 		LinkHeader: buffer.View(eth),
 	}
 	pkt.Data.TrimFront(d.e.hdrSize)
 
-	d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
+	d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
 
 	// Prepare e.views for another packet: release used views.
 	for i := 0; i < used; i++ {
@@ -169,7 +169,7 @@ type recvMMsgDispatcher struct {
 
 	// iovecs is an array of array of iovec records where each iovec base
 	// pointer and length are initialzed to the corresponding view above,
-	// except when GSO is neabled then the first iovec in each array of
+	// except when GSO is enabled then the first iovec in each array of
 	// iovecs points to a buffer for the vnet header which is stripped
 	// before the views are passed up the stack for further processing.
 	iovecs [][]syscall.Iovec
@@ -296,12 +296,12 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
 		}
 
 		used := d.capViews(k, int(n), BufConfig)
-		pkt := stack.PacketBuffer{
+		pkt := &stack.PacketBuffer{
 			Data:       buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)),
 			LinkHeader: buffer.View(eth),
 		}
 		pkt.Data.TrimFront(d.e.hdrSize)
-		d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt)
+		d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
 
 		// Prepare e.views for another packet: release used views.
 		for i := 0; i < used; i++ {
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
index 073c84ef9..568c6874f 100644
--- a/pkg/tcpip/link/loopback/loopback.go
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -76,7 +76,7 @@ func (*endpoint) Wait() {}
 
 // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
 // packets to the network-layer dispatcher.
-func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
 	views[0] = pkt.Header.View()
 	views = append(views, pkt.Data.Views()...)
@@ -84,7 +84,7 @@ func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Netw
 	// Because we're immediately turning around and writing the packet back
 	// to the rx path, we intentionally don't preserve the remote and local
 	// link addresses from the stack.Route we're passed.
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, stack.PacketBuffer{
+	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, &stack.PacketBuffer{
 		Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 	})
 
@@ -106,7 +106,7 @@ func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	}
 	linkHeader := header.Ethernet(hdr)
 	vv.TrimFront(len(linkHeader))
-	e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), stack.PacketBuffer{
+	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, linkHeader.Type(), &stack.PacketBuffer{
 		Data:       vv,
 		LinkHeader: buffer.View(linkHeader),
 	})
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
index a5478ce17..c69d6b7e9 100644
--- a/pkg/tcpip/link/muxed/injectable.go
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -80,8 +80,8 @@ func (m *InjectableEndpoint) IsAttached() bool {
 }
 
 // InjectInbound implements stack.InjectableLinkEndpoint.
-func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
-	m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, pkt)
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	m.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
 }
 
 // WritePackets writes outbound packets to the appropriate
@@ -98,7 +98,7 @@ func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts s
 // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint
 // based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a
 // route registered in this endpoint.
-func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	if endpoint, ok := m.routes[r.RemoteAddress]; ok {
 		return endpoint.WritePacket(r, gso, protocol, pkt)
 	}
diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go
index 87c734c1f..0744f66d6 100644
--- a/pkg/tcpip/link/muxed/injectable_test.go
+++ b/pkg/tcpip/link/muxed/injectable_test.go
@@ -50,7 +50,7 @@ func TestInjectableEndpointDispatch(t *testing.T) {
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
 
-	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, stack.PacketBuffer{
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView(),
 	})
@@ -70,7 +70,7 @@ func TestInjectableEndpointDispatchHdrOnly(t *testing.T) {
 	hdr := buffer.NewPrependable(1)
 	hdr.Prepend(1)[0] = 0xFA
 	packetRoute := stack.Route{RemoteAddress: dstIP}
-	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, stack.PacketBuffer{
+	endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.NewView(0).ToVectorisedView(),
 	})
diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go
index 54432194d..b5dfb7850 100644
--- a/pkg/tcpip/link/qdisc/fifo/endpoint.go
+++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go
@@ -102,8 +102,8 @@ func (q *queueDispatcher) dispatchLoop() {
 }
 
 // DeliverNetworkPacket implements stack.NetworkDispatcher.DeliverNetworkPacket.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
 }
 
 // Attach implements stack.LinkEndpoint.Attach.
@@ -146,7 +146,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
 }
 
 // WritePacket implements stack.LinkEndpoint.WritePacket.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	// WritePacket caller's do not set the following fields in PacketBuffer
 	// so we populate them here.
 	newRoute := r.Clone()
@@ -154,7 +154,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
 	pkt.GSOOptions = gso
 	pkt.NetworkProtocolNumber = protocol
 	d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)]
-	if !d.q.enqueue(&pkt) {
+	if !d.q.enqueue(pkt) {
 		return tcpip.ErrNoBufferSpace
 	}
 	d.newPacketWaker.Assert()
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
index 0b5a6cf49..99313ee25 100644
--- a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
+++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
@@ -14,7 +14,7 @@
 
 // +build linux,amd64 linux,arm64
 // +build go1.12
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 0796d717e..0374a2441 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -185,7 +185,7 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress {
 
 // WritePacket writes outbound packets to the file descriptor. If it is not
 // currently writable, the packet is dropped.
-func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	// Add the ethernet header here.
 	eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
 	pkt.LinkHeader = buffer.View(eth)
@@ -275,7 +275,7 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
 
 		// Send packet up the stack.
 		eth := header.Ethernet(b[:header.EthernetMinimumSize])
-		d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), stack.PacketBuffer{
+		d.DeliverNetworkPacket(eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), &stack.PacketBuffer{
 			Data:       buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(),
 			LinkHeader: buffer.View(eth),
 		})
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 33f640b85..28a2e88ba 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -131,7 +131,7 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress
 	return c
 }
 
-func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (c *testContext) DeliverNetworkPacket(remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	c.mu.Lock()
 	c.packets = append(c.packets, packetInfo{
 		addr:  remoteLinkAddr,
@@ -273,7 +273,7 @@ func TestSimpleSend(t *testing.T) {
 			randomFill(buf)
 
 			proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-			if err := c.ep.WritePacket(&r, nil /* gso */, proto, stack.PacketBuffer{
+			if err := c.ep.WritePacket(&r, nil /* gso */, proto, &stack.PacketBuffer{
 				Header: hdr,
 				Data:   buf.ToVectorisedView(),
 			}); err != nil {
@@ -345,7 +345,7 @@ func TestPreserveSrcAddressInSend(t *testing.T) {
 	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
 
 	proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
-	if err := c.ep.WritePacket(&r, nil /* gso */, proto, stack.PacketBuffer{
+	if err := c.ep.WritePacket(&r, nil /* gso */, proto, &stack.PacketBuffer{
 		Header: hdr,
 	}); err != nil {
 		t.Fatalf("WritePacket failed: %v", err)
@@ -401,7 +401,7 @@ func TestFillTxQueue(t *testing.T) {
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -419,7 +419,7 @@ func TestFillTxQueue(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	}); err != want {
@@ -447,7 +447,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	// Send two packets so that the id slice has at least two slots.
 	for i := 2; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -470,7 +470,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queuePipeSize / 40; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -488,7 +488,7 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	}); err != want {
@@ -514,7 +514,7 @@ func TestFillTxMemory(t *testing.T) {
 	ids := make(map[uint64]struct{})
 	for i := queueDataSize / bufferSize; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -533,7 +533,7 @@ func TestFillTxMemory(t *testing.T) {
 
 	// Next attempt to write must fail.
 	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buf.ToVectorisedView(),
 	})
@@ -561,7 +561,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// until there is only one buffer left.
 	for i := queueDataSize/bufferSize - 1; i > 0; i-- {
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
@@ -577,7 +577,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
 		uu := buffer.NewView(bufferSize).ToVectorisedView()
-		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   uu,
 		}); err != want {
@@ -588,7 +588,7 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
 	// Attempt to write the one-buffer packet again. It must succeed.
 	{
 		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
-		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buf.ToVectorisedView(),
 		}); err != nil {
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index da1c520ae..f2e47b6a7 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -47,13 +47,6 @@ var LogPackets uint32 = 1
 // LogPacketsToPCAP must be accessed atomically.
 var LogPacketsToPCAP uint32 = 1
 
-var transportProtocolMinSizes map[tcpip.TransportProtocolNumber]int = map[tcpip.TransportProtocolNumber]int{
-	header.ICMPv4ProtocolNumber: header.IPv4MinimumSize,
-	header.ICMPv6ProtocolNumber: header.IPv6MinimumSize,
-	header.UDPProtocolNumber:    header.UDPMinimumSize,
-	header.TCPProtocolNumber:    header.TCPMinimumSize,
-}
-
 type endpoint struct {
 	dispatcher stack.NetworkDispatcher
 	lower      stack.LinkEndpoint
@@ -120,9 +113,9 @@ func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (
 // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
 // called by the link-layer endpoint being wrapped when a packet arrives, and
 // logs the packet before forwarding to the actual dispatcher.
-func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
-	e.dumpPacket("recv", nil, protocol, &pkt)
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dumpPacket("recv", nil, protocol, pkt)
+	e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
 }
 
 // Attach implements the stack.LinkEndpoint interface. It saves the dispatcher
@@ -208,8 +201,8 @@ func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.Netw
 // WritePacket implements the stack.LinkEndpoint interface. It is called by
 // higher-level protocols to write packets; it just logs the packet and
 // forwards the request to the lower endpoint.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
-	e.dumpPacket("send", gso, protocol, &pkt)
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	e.dumpPacket("send", gso, protocol, pkt)
 	return e.lower.WritePacket(r, gso, protocol, pkt)
 }
 
@@ -287,7 +280,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 		vv.TrimFront(header.ARPSize)
 		arp := header.ARP(hdr)
 		log.Infof(
-			"%s arp %v (%v) -> %v (%v) valid:%v",
+			"%s arp %s (%s) -> %s (%s) valid:%t",
 			prefix,
 			tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()),
 			tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()),
@@ -299,13 +292,6 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 		return
 	}
 
-	// We aren't guaranteed to have a transport header - it's possible for
-	// writes via raw endpoints to contain only network headers.
-	if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && vv.Size() < minSize {
-		log.Infof("%s %v -> %v transport protocol: %d, but no transport header found (possible raw packet)", prefix, src, dst, transProto)
-		return
-	}
-
 	// Figure out the transport layer info.
 	transName := "unknown"
 	srcPort := uint16(0)
@@ -346,7 +332,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 				icmpType = "info reply"
 			}
 		}
-		log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+		log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
 		return
 
 	case header.ICMPv6ProtocolNumber:
@@ -381,7 +367,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 		case header.ICMPv6RedirectMsg:
 			icmpType = "redirect message"
 		}
-		log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+		log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
 		return
 
 	case header.UDPProtocolNumber:
@@ -428,7 +414,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 					flagsStr[i] = ' '
 				}
 			}
-			details = fmt.Sprintf("flags:0x%02x (%v) seqnum: %v ack: %v win: %v xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum())
+			details = fmt.Sprintf("flags:0x%02x (%s) seqnum: %d ack: %d win: %d xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum())
 			if flags&header.TCPFlagSyn != 0 {
 				details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0))
 			} else {
@@ -437,7 +423,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 		}
 
 	default:
-		log.Infof("%s %v -> %v unknown transport protocol: %d", prefix, src, dst, transProto)
+		log.Infof("%s %s -> %s unknown transport protocol: %d", prefix, src, dst, transProto)
 		return
 	}
 
@@ -445,5 +431,5 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
 		details += fmt.Sprintf(" gso: %+v", gso)
 	}
 
-	log.Infof("%s %s %v:%v -> %v:%v len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details)
+	log.Infof("%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details)
 }
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
index 617446ea2..6bc9033d0 100644
--- a/pkg/tcpip/link/tun/device.go
+++ b/pkg/tcpip/link/tun/device.go
@@ -213,7 +213,7 @@ func (d *Device) Write(data []byte) (int64, error) {
 		remote = tcpip.LinkAddress(zeroMAC[:])
 	}
 
-	pkt := stack.PacketBuffer{
+	pkt := &stack.PacketBuffer{
 		Data: buffer.View(data).ToVectorisedView(),
 	}
 	if ethHdr != nil {
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
index 2b3741276..949b3f2b2 100644
--- a/pkg/tcpip/link/waitable/waitable.go
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -50,12 +50,12 @@ func New(lower stack.LinkEndpoint) *Endpoint {
 // It is called by the link-layer endpoint being wrapped when a packet arrives,
 // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
 // been called.
-func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	if !e.dispatchGate.Enter() {
 		return
 	}
 
-	e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt)
+	e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
 	e.dispatchGate.Leave()
 }
 
@@ -99,7 +99,7 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
 // WritePacket implements stack.LinkEndpoint.WritePacket. It is called by
 // higher-level protocols to write packets. It only forwards packets to the
 // lower endpoint if Wait or WaitWrite haven't been called.
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	if !e.writeGate.Enter() {
 		return nil
 	}
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
index 54eb5322b..63bf40562 100644
--- a/pkg/tcpip/link/waitable/waitable_test.go
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -35,7 +35,7 @@ type countedEndpoint struct {
 	dispatcher stack.NetworkDispatcher
 }
 
-func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (e *countedEndpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	e.dispatchCount++
 }
 
@@ -65,7 +65,7 @@ func (e *countedEndpoint) LinkAddress() tcpip.LinkAddress {
 	return e.linkAddr
 }
 
-func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	e.writeCount++
 	return nil
 }
@@ -89,21 +89,21 @@ func TestWaitWrite(t *testing.T) {
 	wep := New(ep)
 
 	// Write and check that it goes through.
-	wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
 	if want := 1; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on dispatches, then try to write. It must go through.
 	wep.WaitDispatch()
-	wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
 
 	// Wait on writes, then try to write. It must not go through.
 	wep.WaitWrite()
-	wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{})
+	wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
 	if want := 2; ep.writeCount != want {
 		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
 	}
@@ -120,21 +120,21 @@ func TestWaitDispatch(t *testing.T) {
 	}
 
 	// Dispatch and check that it goes through.
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
 	if want := 1; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on writes, then try to dispatch. It must go through.
 	wep.WaitWrite()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
 
 	// Wait on dispatches, then try to dispatch. It must not go through.
 	wep.WaitDispatch()
-	ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{})
+	ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
 	if want := 2; ep.dispatchCount != want {
 		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
 	}
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 9d0797af7..7f27a840d 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -80,7 +80,7 @@ func (e *endpoint) MaxHeaderLength() uint16 {
 
 func (e *endpoint) Close() {}
 
-func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, *stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -94,16 +94,12 @@ func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList
 	return 0, tcpip.ErrNotSupported
 }
 
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	v, ok := pkt.Data.PullUp(header.ARPSize)
-	if !ok {
-		return
-	}
-	h := header.ARP(v)
+func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	h := header.ARP(pkt.NetworkHeader)
 	if !h.IsValid() {
 		return
 	}
@@ -122,7 +118,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 		copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget())
 		copy(packet.HardwareAddressTarget(), h.HardwareAddressSender())
 		copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender())
-		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{
+		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, &stack.PacketBuffer{
 			Header: hdr,
 		})
 		fallthrough // also fill the cache from requests
@@ -177,7 +173,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	copy(h.ProtocolAddressSender(), localAddr)
 	copy(h.ProtocolAddressTarget(), addr)
 
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 	})
 }
@@ -209,6 +205,17 @@ func (*protocol) Close() {}
 // Wait implements stack.TransportProtocol.Wait.
 func (*protocol) Wait() {}
 
+// Parse implements stack.NetworkProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.ARPSize)
+	if !ok {
+		return 0, false, false
+	}
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(header.ARPSize)
+	return 0, false, true
+}
+
 var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff})
 
 // NewProtocol returns an ARP network protocol.
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
index 1646d9cde..66e67429c 100644
--- a/pkg/tcpip/network/arp/arp_test.go
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -103,7 +103,7 @@ func TestDirectRequest(t *testing.T) {
 
 	inject := func(addr tcpip.Address) {
 		copy(h.ProtocolAddressTarget(), addr)
-		c.linkEP.InjectInbound(arp.ProtocolNumber, stack.PacketBuffer{
+		c.linkEP.InjectInbound(arp.ProtocolNumber, &stack.PacketBuffer{
 			Data: v.ToVectorisedView(),
 		})
 	}
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index f42abc4bb..2982450f8 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -81,8 +81,8 @@ func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout t
 	}
 }
 
-// Process processes an incoming fragment belonging to an ID
-// and returns a complete packet when all the packets belonging to that ID have been received.
+// Process processes an incoming fragment belonging to an ID and returns a
+// complete packet when all the packets belonging to that ID have been received.
 func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) {
 	f.mu.Lock()
 	r, ok := f.reassemblers[id]
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 4c20301c6..7c8fb3e0a 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -96,7 +96,7 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff
 // DeliverTransportPacket is called by network endpoints after parsing incoming
 // packets. This is used by the test object to verify that the results of the
 // parsing are expected.
-func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt stack.PacketBuffer) {
+func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt *stack.PacketBuffer) {
 	t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress)
 	t.dataCalls++
 }
@@ -104,7 +104,7 @@ func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.Trans
 // DeliverTransportControlPacket is called by network endpoints after parsing
 // incoming control (ICMP) packets. This is used by the test object to verify
 // that the results of the parsing are expected.
-func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
+func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 	t.checkValues(trans, pkt.Data, remote, local)
 	if typ != t.typ {
 		t.t.Errorf("typ = %v, want %v", typ, t.typ)
@@ -150,7 +150,7 @@ func (*testObject) Wait() {}
 // WritePacket is called by network endpoints after producing a packet and
 // writing it to the link endpoint. This is used by the test object to verify
 // that the produced packet is as expected.
-func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	var prot tcpip.TransportProtocolNumber
 	var srcAddr tcpip.Address
 	var dstAddr tcpip.Address
@@ -246,7 +246,11 @@ func TestIPv4Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: 123,
+		TTL:      123,
+		TOS:      stack.DefaultTOS,
+	}, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -289,9 +293,9 @@ func TestIPv4Receive(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	ep.HandlePacket(&r, stack.PacketBuffer{
-		Data: view.ToVectorisedView(),
-	})
+	pkt := stack.PacketBuffer{Data: view.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -378,10 +382,7 @@ func TestIPv4ReceiveControl(t *testing.T) {
 			o.typ = c.expectedTyp
 			o.extra = c.expectedExtra
 
-			vv := view[:len(view)-c.trunc].ToVectorisedView()
-			ep.HandlePacket(&r, stack.PacketBuffer{
-				Data: vv,
-			})
+			ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv4MinimumSize))
 			if want := c.expectedCount; o.controlCalls != want {
 				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
 			}
@@ -444,17 +445,17 @@ func TestIPv4FragmentationReceive(t *testing.T) {
 	}
 
 	// Send first segment.
-	ep.HandlePacket(&r, stack.PacketBuffer{
-		Data: frag1.ToVectorisedView(),
-	})
+	pkt := stack.PacketBuffer{Data: frag1.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
 	if o.dataCalls != 0 {
 		t.Fatalf("Bad number of data calls: got %x, want 0", o.dataCalls)
 	}
 
 	// Send second segment.
-	ep.HandlePacket(&r, stack.PacketBuffer{
-		Data: frag2.ToVectorisedView(),
-	})
+	pkt = stack.PacketBuffer{Data: frag2.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -487,7 +488,11 @@ func TestIPv6Send(t *testing.T) {
 	if err != nil {
 		t.Fatalf("could not find route: %v", err)
 	}
-	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: 123,
+		TTL:      123,
+		TOS:      stack.DefaultTOS,
+	}, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	}); err != nil {
@@ -530,9 +535,9 @@ func TestIPv6Receive(t *testing.T) {
 		t.Fatalf("could not find route: %v", err)
 	}
 
-	ep.HandlePacket(&r, stack.PacketBuffer{
-		Data: view.ToVectorisedView(),
-	})
+	pkt := stack.PacketBuffer{Data: view.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
 	if o.dataCalls != 1 {
 		t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls)
 	}
@@ -644,12 +649,25 @@ func TestIPv6ReceiveControl(t *testing.T) {
 			// Set ICMPv6 checksum.
 			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{}))
 
-			ep.HandlePacket(&r, stack.PacketBuffer{
-				Data: view[:len(view)-c.trunc].ToVectorisedView(),
-			})
+			ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv6MinimumSize))
 			if want := c.expectedCount; o.controlCalls != want {
 				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
 			}
 		})
 	}
 }
+
+// truncatedPacket returns a PacketBuffer based on a truncated view. If view,
+// after truncation, is large enough to hold a network header, it makes part of
+// view the packet's NetworkHeader and the rest its Data. Otherwise all of view
+// becomes Data.
+func truncatedPacket(view buffer.View, trunc, netHdrLen int) *stack.PacketBuffer {
+	v := view[:len(view)-trunc]
+	if len(v) < netHdrLen {
+		return &stack.PacketBuffer{Data: v.ToVectorisedView()}
+	}
+	return &stack.PacketBuffer{
+		NetworkHeader: v[:netHdrLen],
+		Data:          v[netHdrLen:].ToVectorisedView(),
+	}
+}
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
index 4cbefe5ab..1b67aa066 100644
--- a/pkg/tcpip/network/ipv4/icmp.go
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -24,7 +24,7 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
 	if !ok {
 		return
@@ -56,9 +56,12 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) {
+func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) {
 	stats := r.Stats()
 	received := stats.ICMP.V4PacketsReceived
+	// TODO(gvisor.dev/issue/170): ICMP packets don't have their
+	// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
+	// full explanation.
 	v, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
 	if !ok {
 		received.Invalid.Increment()
@@ -88,7 +91,7 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) {
 
 		// It's possible that a raw socket expects to receive this.
 		h.SetChecksum(wantChecksum)
-		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, stack.PacketBuffer{
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, &stack.PacketBuffer{
 			Data:          pkt.Data.Clone(nil),
 			NetworkHeader: append(buffer.View(nil), pkt.NetworkHeader...),
 		})
@@ -102,7 +105,11 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) {
 		pkt.SetChecksum(0)
 		pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0)))
 		sent := stats.ICMP.V4PacketsSent
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
+			Protocol: header.ICMPv4ProtocolNumber,
+			TTL:      r.DefaultTTL(),
+			TOS:      stack.DefaultTOS,
+		}, &stack.PacketBuffer{
 			Header:          hdr,
 			Data:            vv,
 			TransportHeader: buffer.View(pkt),
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index 64046cbbf..7e9f16c90 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -21,6 +21,7 @@
 package ipv4
 
 import (
+	"fmt"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -129,7 +130,7 @@ func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
 // packet's stated length matches the length of the header+payload. mtu
 // includes the IP header and options. This does not support the DontFragment
 // IP flag.
-func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt *stack.PacketBuffer) *tcpip.Error {
 	// This packet is too big, it needs to be fragmented.
 	ip := header.IPv4(pkt.Header.View())
 	flags := ip.Flags()
@@ -169,7 +170,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 		if i > 0 {
 			newPayload := pkt.Data.Clone(nil)
 			newPayload.CapLength(innerMTU)
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, &stack.PacketBuffer{
 				Header:        pkt.Header,
 				Data:          newPayload,
 				NetworkHeader: buffer.View(h),
@@ -188,7 +189,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 			newPayload := pkt.Data.Clone(nil)
 			newPayloadLength := outerMTU - pkt.Header.UsedLength()
 			newPayload.CapLength(newPayloadLength)
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, &stack.PacketBuffer{
 				Header:        pkt.Header,
 				Data:          newPayload,
 				NetworkHeader: buffer.View(h),
@@ -202,7 +203,7 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int,
 			startOfHdr := pkt.Header
 			startOfHdr.TrimBack(pkt.Header.UsedLength() - outerMTU)
 			emptyVV := buffer.NewVectorisedView(0, []buffer.View{})
-			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, &stack.PacketBuffer{
 				Header:        startOfHdr,
 				Data:          emptyVV,
 				NetworkHeader: buffer.View(h),
@@ -245,7 +246,7 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
@@ -253,43 +254,29 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 	// iptables filtering. All packets that reach here are locally
 	// generated.
 	ipt := e.stack.IPTables()
-	if ok := ipt.Check(stack.Output, &pkt, gso, r, "", nicName); !ok {
+	if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok {
 		// iptables is telling us to drop the packet.
 		return nil
 	}
 
+	// If the packet is manipulated as per NAT Ouput rules, handle packet
+	// based on destination address and do not send the packet to link layer.
+	// TODO(gvisor.dev/issue/170): We should do this for every packet, rather than
+	// only NATted packets, but removing this check short circuits broadcasts
+	// before they are sent out to other hosts.
 	if pkt.NatDone {
-		// If the packet is manipulated as per NAT Ouput rules, handle packet
-		// based on destination address and do not send the packet to link layer.
 		netHeader := header.IPv4(pkt.NetworkHeader)
 		ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress())
 		if err == nil {
-			src := netHeader.SourceAddress()
-			dst := netHeader.DestinationAddress()
-			route := r.ReverseRoute(src, dst)
-
-			views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
-			views[0] = pkt.Header.View()
-			views = append(views, pkt.Data.Views()...)
-			packet := stack.PacketBuffer{
-				Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views)}
-			ep.HandlePacket(&route, packet)
+			route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress())
+			ep.HandlePacket(&route, pkt)
 			return nil
 		}
 	}
 
 	if r.Loop&stack.PacketLoop != 0 {
-		// The inbound path expects the network header to still be in
-		// the PacketBuffer's Data field.
-		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
-		views[0] = pkt.Header.View()
-		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
-
-		e.HandlePacket(&loopedR, stack.PacketBuffer{
-			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
-		})
-
+		e.HandlePacket(&loopedR, pkt)
 		loopedR.Release()
 	}
 	if r.Loop&stack.PacketOut == 0 {
@@ -342,23 +329,16 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 		}
 		if _, ok := natPkts[pkt]; ok {
 			netHeader := header.IPv4(pkt.NetworkHeader)
-			ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress())
-			if err == nil {
+			if ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()); err == nil {
 				src := netHeader.SourceAddress()
 				dst := netHeader.DestinationAddress()
 				route := r.ReverseRoute(src, dst)
-
-				views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
-				views[0] = pkt.Header.View()
-				views = append(views, pkt.Data.Views()...)
-				packet := stack.PacketBuffer{
-					Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views)}
-				ep.HandlePacket(&route, packet)
+				ep.HandlePacket(&route, pkt)
 				n++
 				continue
 			}
 		}
-		if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, *pkt); err != nil {
+		if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
 			r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
 			return n, err
 		}
@@ -370,7 +350,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
 	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
@@ -426,35 +406,23 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuf
 
 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	headerView, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
-	if !ok {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	h := header.IPv4(pkt.NetworkHeader)
+	if !h.IsValid(pkt.Data.Size() + len(pkt.NetworkHeader) + len(pkt.TransportHeader)) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
-	h := header.IPv4(headerView)
-	if !h.IsValid(pkt.Data.Size()) {
-		r.Stats().IP.MalformedPacketsReceived.Increment()
-		return
-	}
-	pkt.NetworkHeader = headerView[:h.HeaderLength()]
-
-	hlen := int(h.HeaderLength())
-	tlen := int(h.TotalLength())
-	pkt.Data.TrimFront(hlen)
-	pkt.Data.CapLength(tlen - hlen)
 
 	// iptables filtering. All packets that reach here are intended for
 	// this machine and will not be forwarded.
 	ipt := e.stack.IPTables()
-	if ok := ipt.Check(stack.Input, &pkt, nil, nil, "", ""); !ok {
+	if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok {
 		// iptables is telling us to drop the packet.
 		return
 	}
 
-	more := (h.Flags() & header.IPv4FlagMoreFragments) != 0
-	if more || h.FragmentOffset() != 0 {
-		if pkt.Data.Size() == 0 {
+	if h.More() || h.FragmentOffset() != 0 {
+		if pkt.Data.Size()+len(pkt.TransportHeader) == 0 {
 			// Drop the packet as it's marked as a fragment but has
 			// no payload.
 			r.Stats().IP.MalformedPacketsReceived.Increment()
@@ -473,7 +441,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 		}
 		var ready bool
 		var err error
-		pkt.Data, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, pkt.Data)
+		pkt.Data, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, h.More(), pkt.Data)
 		if err != nil {
 			r.Stats().IP.MalformedPacketsReceived.Increment()
 			r.Stats().IP.MalformedFragmentsReceived.Increment()
@@ -485,7 +453,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 	}
 	p := h.TransportProtocol()
 	if p == header.ICMPv4ProtocolNumber {
-		headerView.CapLength(hlen)
+		pkt.NetworkHeader.CapLength(int(h.HeaderLength()))
 		e.handleICMP(r, pkt)
 		return
 	}
@@ -565,6 +533,35 @@ func (*protocol) Close() {}
 // Wait implements stack.TransportProtocol.Wait.
 func (*protocol) Wait() {}
 
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return 0, false, false
+	}
+	ipHdr := header.IPv4(hdr)
+
+	// If there are options, pull those into hdr as well.
+	if headerLen := int(ipHdr.HeaderLength()); headerLen > header.IPv4MinimumSize && headerLen <= pkt.Data.Size() {
+		hdr, ok = pkt.Data.PullUp(headerLen)
+		if !ok {
+			panic(fmt.Sprintf("There are only %d bytes in pkt.Data, but there should be at least %d", pkt.Data.Size(), headerLen))
+		}
+		ipHdr = header.IPv4(hdr)
+	}
+
+	// If this is a fragment, don't bother parsing the transport header.
+	parseTransportHeader := true
+	if ipHdr.More() || ipHdr.FragmentOffset() != 0 {
+		parseTransportHeader = false
+	}
+
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(len(hdr))
+	pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr))
+	return ipHdr.TransportProtocol(), parseTransportHeader, true
+}
+
 // calculateMTU calculates the network-layer payload MTU based on the link-layer
 // payload mtu.
 func calculateMTU(mtu uint32) uint32 {
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 36035c820..11e579c4b 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -114,7 +114,7 @@ func makeHdrAndPayload(hdrLength int, extraLength int, viewSizes []int) (buffer.
 
 // comparePayloads compared the contents of all the packets against the contents
 // of the source packet.
-func compareFragments(t *testing.T, packets []stack.PacketBuffer, sourcePacketInfo stack.PacketBuffer, mtu uint32) {
+func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketInfo *stack.PacketBuffer, mtu uint32) {
 	t.Helper()
 	// Make a complete array of the sourcePacketInfo packet.
 	source := header.IPv4(packets[0].Header.View()[:header.IPv4MinimumSize])
@@ -174,7 +174,7 @@ func compareFragments(t *testing.T, packets []stack.PacketBuffer, sourcePacketIn
 
 type errorChannel struct {
 	*channel.Endpoint
-	Ch                    chan stack.PacketBuffer
+	Ch                    chan *stack.PacketBuffer
 	packetCollectorErrors []*tcpip.Error
 }
 
@@ -184,7 +184,7 @@ type errorChannel struct {
 func newErrorChannel(size int, mtu uint32, linkAddr tcpip.LinkAddress, packetCollectorErrors []*tcpip.Error) *errorChannel {
 	return &errorChannel{
 		Endpoint:              channel.New(size, mtu, linkAddr),
-		Ch:                    make(chan stack.PacketBuffer, size),
+		Ch:                    make(chan *stack.PacketBuffer, size),
 		packetCollectorErrors: packetCollectorErrors,
 	}
 }
@@ -203,7 +203,7 @@ func (e *errorChannel) Drain() int {
 }
 
 // WritePacket stores outbound packets into the channel.
-func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
 	select {
 	case e.Ch <- pkt:
 	default:
@@ -282,13 +282,17 @@ func TestFragmentation(t *testing.T) {
 	for _, ft := range fragTests {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, ft.extraLength, ft.payloadViewsSizes)
-			source := stack.PacketBuffer{
+			source := &stack.PacketBuffer{
 				Header: hdr,
 				// Save the source payload because WritePacket will modify it.
 				Data: payload.Clone(nil),
 			}
 			c := buildContext(t, nil, ft.mtu)
-			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, stack.PacketBuffer{
+			err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{
+				Protocol: tcp.ProtocolNumber,
+				TTL:      42,
+				TOS:      stack.DefaultTOS,
+			}, &stack.PacketBuffer{
 				Header: hdr,
 				Data:   payload,
 			})
@@ -296,7 +300,7 @@ func TestFragmentation(t *testing.T) {
 				t.Errorf("err got %v, want %v", err, nil)
 			}
 
-			var results []stack.PacketBuffer
+			var results []*stack.PacketBuffer
 		L:
 			for {
 				select {
@@ -338,7 +342,11 @@ func TestFragmentationErrors(t *testing.T) {
 		t.Run(ft.description, func(t *testing.T) {
 			hdr, payload := makeHdrAndPayload(ft.hdrLength, header.IPv4MinimumSize, ft.payloadViewsSizes)
 			c := buildContext(t, ft.packetCollectorErrors, ft.mtu)
-			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, stack.PacketBuffer{
+			err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{
+				Protocol: tcp.ProtocolNumber,
+				TTL:      42,
+				TOS:      stack.DefaultTOS,
+			}, &stack.PacketBuffer{
 				Header: hdr,
 				Data:   payload,
 			})
@@ -460,7 +468,7 @@ func TestInvalidFragments(t *testing.T) {
 			s.CreateNIC(nicID, sniffer.New(ep))
 
 			for _, pkt := range tc.packets {
-				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, stack.PacketBuffer{
+				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, &stack.PacketBuffer{
 					Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}),
 				})
 			}
@@ -644,6 +652,18 @@ func TestReceiveFragments(t *testing.T) {
 			},
 			expectedPayloads: [][]byte{udpPayload1, udpPayload2},
 		},
+		{
+			name: "Fragment without followup",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+			},
+			expectedPayloads: nil,
+		},
 	}
 
 	for _, test := range tests {
@@ -698,7 +718,7 @@ func TestReceiveFragments(t *testing.T) {
 				vv := hdr.View().ToVectorisedView()
 				vv.AppendView(frag.payload)
 
-				e.InjectInbound(header.IPv4ProtocolNumber, stack.PacketBuffer{
+				e.InjectInbound(header.IPv4ProtocolNumber, &stack.PacketBuffer{
 					Data: vv,
 				})
 			}
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
index bdf3a0d25..2ff7eedf4 100644
--- a/pkg/tcpip/network/ipv6/icmp.go
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -27,7 +27,7 @@ import (
 // the original packet that caused the ICMP one to be sent. This information is
 // used to find out which transport endpoint must be notified about the ICMP
 // packet.
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 	h, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
 	if !ok {
 		return
@@ -70,17 +70,20 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.
 	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
 }
 
-func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.PacketBuffer, hasFragmentHeader bool) {
+func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragmentHeader bool) {
 	stats := r.Stats().ICMP
 	sent := stats.V6PacketsSent
 	received := stats.V6PacketsReceived
+	// TODO(gvisor.dev/issue/170): ICMP packets don't have their
+	// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
+	// full explanation.
 	v, ok := pkt.Data.PullUp(header.ICMPv6HeaderSize)
 	if !ok {
 		received.Invalid.Increment()
 		return
 	}
 	h := header.ICMPv6(v)
-	iph := header.IPv6(netHeader)
+	iph := header.IPv6(pkt.NetworkHeader)
 
 	// Validate ICMPv6 checksum before processing the packet.
 	//
@@ -288,7 +291,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 		//
 		// The IP Hop Limit field has a value of 255, i.e., the packet
 		// could not possibly have been forwarded by a router.
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, stack.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 			Header: hdr,
 		}); err != nil {
 			sent.Dropped.Increment()
@@ -390,7 +393,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P
 		copy(packet, icmpHdr)
 		packet.SetType(header.ICMPv6EchoReply)
 		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
-		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   pkt.Data,
 		}); err != nil {
@@ -532,7 +535,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.
 	})
 
 	// TODO(stijlist): count this in ICMP stats.
-	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{
+	return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, &stack.PacketBuffer{
 		Header: hdr,
 	})
 }
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
index d412ff688..52a01b44e 100644
--- a/pkg/tcpip/network/ipv6/icmp_test.go
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -57,7 +57,7 @@ func (*stubLinkEndpoint) LinkAddress() tcpip.LinkAddress {
 	return ""
 }
 
-func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, stack.PacketBuffer) *tcpip.Error {
+func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error {
 	return nil
 }
 
@@ -67,7 +67,7 @@ type stubDispatcher struct {
 	stack.TransportDispatcher
 }
 
-func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, stack.PacketBuffer) {
+func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, *stack.PacketBuffer) {
 }
 
 type stubLinkAddressCache struct {
@@ -179,36 +179,32 @@ func TestICMPCounts(t *testing.T) {
 		},
 	}
 
-	handleIPv6Payload := func(hdr buffer.Prependable) {
-		payloadLength := hdr.UsedLength()
-		ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+	handleIPv6Payload := func(icmp header.ICMPv6) {
+		ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
 		ip.Encode(&header.IPv6Fields{
-			PayloadLength: uint16(payloadLength),
+			PayloadLength: uint16(len(icmp)),
 			NextHeader:    uint8(header.ICMPv6ProtocolNumber),
 			HopLimit:      header.NDPHopLimit,
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(&r, stack.PacketBuffer{
-			Data: hdr.View().ToVectorisedView(),
+		ep.HandlePacket(&r, &stack.PacketBuffer{
+			NetworkHeader: buffer.View(ip),
+			Data:          buffer.View(icmp).ToVectorisedView(),
 		})
 	}
 
 	for _, typ := range types {
-		extraDataLen := len(typ.extraData)
-		hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
-		extraData := buffer.View(hdr.Prepend(extraDataLen))
-		copy(extraData, typ.extraData)
-		pkt := header.ICMPv6(hdr.Prepend(typ.size))
-		pkt.SetType(typ.typ)
-		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
-
-		handleIPv6Payload(hdr)
+		icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+		copy(icmp[typ.size:], typ.extraData)
+		icmp.SetType(typ.typ)
+		icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+		handleIPv6Payload(icmp)
 	}
 
 	// Construct an empty ICMP packet so that
 	// Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented.
-	handleIPv6Payload(buffer.NewPrependable(header.IPv6MinimumSize))
+	handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize)))
 
 	icmpv6Stats := s.Stats().ICMP.V6PacketsReceived
 	visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) {
@@ -328,7 +324,7 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.
 		views := []buffer.View{pi.Pkt.Header.View(), pi.Pkt.Data.ToView()}
 		size := pi.Pkt.Header.UsedLength() + pi.Pkt.Data.Size()
 		vv := buffer.NewVectorisedView(size, views)
-		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), stack.PacketBuffer{
+		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), &stack.PacketBuffer{
 			Data: vv,
 		})
 	}
@@ -546,25 +542,22 @@ func TestICMPChecksumValidationSimple(t *testing.T) {
 			}
 
 			handleIPv6Payload := func(checksum bool) {
-				extraDataLen := len(typ.extraData)
-				hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen)
-				extraData := buffer.View(hdr.Prepend(extraDataLen))
-				copy(extraData, typ.extraData)
-				pkt := header.ICMPv6(hdr.Prepend(typ.size))
-				pkt.SetType(typ.typ)
+				icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+				copy(icmp[typ.size:], typ.extraData)
+				icmp.SetType(typ.typ)
 				if checksum {
-					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, extraData.ToVectorisedView()))
+					icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView()))
 				}
-				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
 				ip.Encode(&header.IPv6Fields{
-					PayloadLength: uint16(typ.size + extraDataLen),
+					PayloadLength: uint16(len(icmp)),
 					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
 					HopLimit:      header.NDPHopLimit,
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
-					Data: hdr.View().ToVectorisedView(),
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+					Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}),
 				})
 			}
 
@@ -740,7 +733,7 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 					Data: hdr.View().ToVectorisedView(),
 				})
 			}
@@ -918,7 +911,7 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) {
 					SrcAddr:       lladdr1,
 					DstAddr:       lladdr0,
 				})
-				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 					Data: buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize, []buffer.View{hdr.View(), payload}),
 				})
 			}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index daf1fcbc6..95fbcf2d1 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -116,7 +116,7 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
 }
 
 // WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
 	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
 	pkt.NetworkHeader = buffer.View(ip)
 
@@ -128,7 +128,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
 		views = append(views, pkt.Data.Views()...)
 		loopedR := r.MakeLoopedRoute()
 
-		e.HandlePacket(&loopedR, stack.PacketBuffer{
+		e.HandlePacket(&loopedR, &stack.PacketBuffer{
 			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
 		})
 
@@ -163,30 +163,28 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
 
 // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
 // supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
+func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
 	// TODO(b/146666412): Support IPv6 header-included packets.
 	return tcpip.ErrNotSupported
 }
 
 // HandlePacket is called by the link layer when new ipv6 packets arrive for
 // this endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
-	headerView, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
-	if !ok {
+func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	h := header.IPv6(pkt.NetworkHeader)
+	if !h.IsValid(pkt.Data.Size() + len(pkt.NetworkHeader) + len(pkt.TransportHeader)) {
 		r.Stats().IP.MalformedPacketsReceived.Increment()
 		return
 	}
-	h := header.IPv6(headerView)
-	if !h.IsValid(pkt.Data.Size()) {
-		r.Stats().IP.MalformedPacketsReceived.Increment()
-		return
-	}
-
-	pkt.NetworkHeader = headerView[:header.IPv6MinimumSize]
-	pkt.Data.TrimFront(header.IPv6MinimumSize)
-	pkt.Data.CapLength(int(h.PayloadLength()))
 
-	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), pkt.Data)
+	// vv consists of:
+	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
+	// - The transport header, if present.
+	// - Any other payload data.
+	vv := pkt.NetworkHeader[header.IPv6MinimumSize:].ToVectorisedView()
+	vv.AppendView(pkt.TransportHeader)
+	vv.Append(pkt.Data)
+	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv)
 	hasFragmentHeader := false
 
 	for firstHeader := true; ; firstHeader = false {
@@ -262,9 +260,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 		case header.IPv6FragmentExtHdr:
 			hasFragmentHeader = true
 
-			fragmentOffset := extHdr.FragmentOffset()
-			more := extHdr.More()
-			if !more && fragmentOffset == 0 {
+			if extHdr.IsAtomic() {
 				// This fragment extension header indicates that this packet is an
 				// atomic fragment. An atomic fragment is a fragment that contains
 				// all the data required to reassemble a full packet. As per RFC 6946,
@@ -277,9 +273,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 			// Don't consume the iterator if we have the first fragment because we
 			// will use it to validate that the first fragment holds the upper layer
 			// header.
-			rawPayload := it.AsRawHeader(fragmentOffset != 0 /* consume */)
+			rawPayload := it.AsRawHeader(extHdr.FragmentOffset() != 0 /* consume */)
 
-			if fragmentOffset == 0 {
+			if extHdr.FragmentOffset() == 0 {
 				// Check that the iterator ends with a raw payload as the first fragment
 				// should include all headers up to and including any upper layer
 				// headers, as per RFC 8200 section 4.5; only upper layer data
@@ -332,7 +328,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 			}
 
 			// The packet is a fragment, let's try to reassemble it.
-			start := fragmentOffset * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit
+			start := extHdr.FragmentOffset() * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit
 			last := start + uint16(fragmentPayloadLen) - 1
 
 			// Drop the packet if the fragmentOffset is incorrect. i.e the
@@ -345,7 +341,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 			}
 
 			var ready bool
-			pkt.Data, ready, err = e.fragmentation.Process(hash.IPv6FragmentHash(h, extHdr.ID()), start, last, more, rawPayload.Buf)
+			// Note that pkt doesn't have its transport header set after reassembly,
+			// and won't until DeliverNetworkPacket sets it.
+			pkt.Data, ready, err = e.fragmentation.Process(hash.IPv6FragmentHash(h, extHdr.ID()), start, last, extHdr.More(), rawPayload.Buf)
 			if err != nil {
 				r.Stats().IP.MalformedPacketsReceived.Increment()
 				r.Stats().IP.MalformedFragmentsReceived.Increment()
@@ -394,10 +392,17 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
 		case header.IPv6RawPayloadHeader:
 			// If the last header in the payload isn't a known IPv6 extension header,
 			// handle it as if it is transport layer data.
+
+			// For unfragmented packets, extHdr still contains the transport header.
+			// Get rid of it.
+			//
+			// For reassembled fragments, pkt.TransportHeader is unset, so this is a
+			// no-op and pkt.Data begins with the transport header.
+			extHdr.Buf.TrimFront(len(pkt.TransportHeader))
 			pkt.Data = extHdr.Buf
 
 			if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber {
-				e.handleICMP(r, headerView, pkt, hasFragmentHeader)
+				e.handleICMP(r, pkt, hasFragmentHeader)
 			} else {
 				r.Stats().IP.PacketsDelivered.Increment()
 				// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
@@ -505,6 +510,79 @@ func (*protocol) Close() {}
 // Wait implements stack.TransportProtocol.Wait.
 func (*protocol) Wait() {}
 
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return 0, false, false
+	}
+	ipHdr := header.IPv6(hdr)
+
+	// dataClone consists of:
+	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
+	// - The transport header, if present.
+	// - Any other payload data.
+	views := [8]buffer.View{}
+	dataClone := pkt.Data.Clone(views[:])
+	dataClone.TrimFront(header.IPv6MinimumSize)
+	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone)
+
+	// Iterate over the IPv6 extensions to find their length.
+	//
+	// Parsing occurs again in HandlePacket because we don't track the
+	// extensions in PacketBuffer. Unfortunately, that means HandlePacket
+	// has to do the parsing work again.
+	var nextHdr tcpip.TransportProtocolNumber
+	foundNext := true
+	extensionsSize := 0
+traverseExtensions:
+	for extHdr, done, err := it.Next(); ; extHdr, done, err = it.Next() {
+		if err != nil {
+			break
+		}
+		// If we exhaust the extension list, the entire packet is the IPv6 header
+		// and (possibly) extensions.
+		if done {
+			extensionsSize = dataClone.Size()
+			foundNext = false
+			break
+		}
+
+		switch extHdr := extHdr.(type) {
+		case header.IPv6FragmentExtHdr:
+			// If this is an atomic fragment, we don't have to treat it specially.
+			if !extHdr.More() && extHdr.FragmentOffset() == 0 {
+				continue
+			}
+			// This is a non-atomic fragment and has to be re-assembled before we can
+			// examine the payload for a transport header.
+			foundNext = false
+
+		case header.IPv6RawPayloadHeader:
+			// We've found the payload after any extensions.
+			extensionsSize = dataClone.Size() - extHdr.Buf.Size()
+			nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier)
+			break traverseExtensions
+
+		default:
+			// Any other extension is a no-op, keep looping until we find the payload.
+		}
+	}
+
+	// Put the IPv6 header with extensions in pkt.NetworkHeader.
+	hdr, ok = pkt.Data.PullUp(header.IPv6MinimumSize + extensionsSize)
+	if !ok {
+		panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size()))
+	}
+	ipHdr = header.IPv6(hdr)
+
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(len(hdr))
+	pkt.Data.CapLength(int(ipHdr.PayloadLength()))
+
+	return nextHdr, foundNext, true
+}
+
 // calculateMTU calculates the network-layer payload MTU based on the link-layer
 // payload mtu.
 func calculateMTU(mtu uint32) uint32 {
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 841a0cb7a..213ff64f2 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -65,7 +65,7 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+	e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 		Data: hdr.View().ToVectorisedView(),
 	})
 
@@ -123,7 +123,7 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst
 		DstAddr:       dst,
 	})
 
-	e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+	e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 		Data: hdr.View().ToVectorisedView(),
 	})
 
@@ -637,7 +637,7 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) {
 				DstAddr:       addr2,
 			})
 
-			e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -1238,7 +1238,7 @@ func TestReceiveIPv6Fragments(t *testing.T) {
 				vv := hdr.View().ToVectorisedView()
 				vv.Append(f.data)
 
-				e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 					Data: vv,
 				})
 			}
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
index 12b70f7e9..64239ce9a 100644
--- a/pkg/tcpip/network/ipv6/ndp_test.go
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -136,7 +136,7 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -380,7 +380,7 @@ func TestNeighorSolicitationResponse(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.PacketBuffer{
+			e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -497,7 +497,7 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
 				t.Fatalf("got invalid = %d, want = 0", got)
 			}
 
-			e.InjectInbound(ProtocolNumber, stack.PacketBuffer{
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -551,25 +551,29 @@ func TestNDPValidation(t *testing.T) {
 		return s, ep, r
 	}
 
-	handleIPv6Payload := func(hdr buffer.Prependable, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) {
+	handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) {
 		nextHdr := uint8(header.ICMPv6ProtocolNumber)
+		var extensions buffer.View
 		if atomicFragment {
-			bytes := hdr.Prepend(header.IPv6FragmentExtHdrLength)
-			bytes[0] = nextHdr
+			extensions = buffer.NewView(header.IPv6FragmentExtHdrLength)
+			extensions[0] = nextHdr
 			nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier)
 		}
 
-		payloadLength := hdr.UsedLength()
-		ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+		ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize + len(extensions)))
 		ip.Encode(&header.IPv6Fields{
-			PayloadLength: uint16(payloadLength),
+			PayloadLength: uint16(len(payload) + len(extensions)),
 			NextHeader:    nextHdr,
 			HopLimit:      hopLimit,
 			SrcAddr:       r.LocalAddress,
 			DstAddr:       r.RemoteAddress,
 		})
-		ep.HandlePacket(r, stack.PacketBuffer{
-			Data: hdr.View().ToVectorisedView(),
+		if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) {
+			t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n)
+		}
+		ep.HandlePacket(r, &stack.PacketBuffer{
+			NetworkHeader: buffer.View(ip),
+			Data:          payload.ToVectorisedView(),
 		})
 	}
 
@@ -676,14 +680,11 @@ func TestNDPValidation(t *testing.T) {
 					invalid := stats.Invalid
 					typStat := typ.statCounter(stats)
 
-					extraDataLen := len(typ.extraData)
-					hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen + header.IPv6FragmentExtHdrLength)
-					extraData := buffer.View(hdr.Prepend(extraDataLen))
-					copy(extraData, typ.extraData)
-					pkt := header.ICMPv6(hdr.Prepend(typ.size))
-					pkt.SetType(typ.typ)
-					pkt.SetCode(test.code)
-					pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView()))
+					icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+					copy(icmp[typ.size:], typ.extraData)
+					icmp.SetType(typ.typ)
+					icmp.SetCode(test.code)
+					icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
 
 					// Rx count of the NDP message should initially be 0.
 					if got := typStat.Value(); got != 0 {
@@ -699,7 +700,7 @@ func TestNDPValidation(t *testing.T) {
 						t.FailNow()
 					}
 
-					handleIPv6Payload(hdr, test.hopLimit, test.atomicFragment, ep, &r)
+					handleIPv6Payload(buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r)
 
 					// Rx count of the NDP packet should have increased.
 					if got := typStat.Value(); got != 1 {
@@ -884,7 +885,7 @@ func TestRouterAdvertValidation(t *testing.T) {
 				t.Fatalf("got rxRA = %d, want = 0", got)
 			}
 
-			e.InjectInbound(header.IPv6ProtocolNumber, stack.PacketBuffer{
+			e.InjectInbound(header.IPv6ProtocolNumber, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index b937cb84b..edc29ad27 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -54,17 +54,27 @@ type Flags struct {
 	LoadBalanced bool
 }
 
-func (f Flags) bits() reuseFlag {
-	var rf reuseFlag
+// Bits converts the Flags to their bitset form.
+func (f Flags) Bits() BitFlags {
+	var rf BitFlags
 	if f.MostRecent {
-		rf |= mostRecentFlag
+		rf |= MostRecentFlag
 	}
 	if f.LoadBalanced {
-		rf |= loadBalancedFlag
+		rf |= LoadBalancedFlag
 	}
 	return rf
 }
 
+// Effective returns the effective behavior of a flag config.
+func (f Flags) Effective() Flags {
+	e := f
+	if e.LoadBalanced && e.MostRecent {
+		e.MostRecent = false
+	}
+	return e
+}
+
 // PortManager manages allocating, reserving and releasing ports.
 type PortManager struct {
 	mu             sync.RWMutex
@@ -78,56 +88,88 @@ type PortManager struct {
 	hint uint32
 }
 
-type reuseFlag int
+// BitFlags is a bitset representation of Flags.
+type BitFlags uint32
 
 const (
-	mostRecentFlag reuseFlag = 1 << iota
-	loadBalancedFlag
+	// MostRecentFlag represents Flags.MostRecent.
+	MostRecentFlag BitFlags = 1 << iota
+
+	// LoadBalancedFlag represents Flags.LoadBalanced.
+	LoadBalancedFlag
+
+	// nextFlag is the value that the next added flag will have.
+	//
+	// It is used to calculate FlagMask below. It is also the number of
+	// valid flag states.
 	nextFlag
 
-	flagMask = nextFlag - 1
+	// FlagMask is a bit mask for BitFlags.
+	FlagMask = nextFlag - 1
 )
 
-type portNode struct {
-	// refs stores the count for each possible flag combination.
+// ToFlags converts the bitset into a Flags struct.
+func (f BitFlags) ToFlags() Flags {
+	return Flags{
+		MostRecent:   f&MostRecentFlag != 0,
+		LoadBalanced: f&LoadBalancedFlag != 0,
+	}
+}
+
+// FlagCounter counts how many references each flag combination has.
+type FlagCounter struct {
+	// refs stores the count for each possible flag combination, (0 though
+	// FlagMask).
 	refs [nextFlag]int
 }
 
-func (p portNode) totalRefs() int {
+// AddRef increases the reference count for a specific flag combination.
+func (c *FlagCounter) AddRef(flags BitFlags) {
+	c.refs[flags]++
+}
+
+// DropRef decreases the reference count for a specific flag combination.
+func (c *FlagCounter) DropRef(flags BitFlags) {
+	c.refs[flags]--
+}
+
+// TotalRefs calculates the total number of references for all flag
+// combinations.
+func (c FlagCounter) TotalRefs() int {
 	var total int
-	for _, r := range p.refs {
+	for _, r := range c.refs {
 		total += r
 	}
 	return total
 }
 
-// flagRefs returns the number of references with all specified flags.
-func (p portNode) flagRefs(flags reuseFlag) int {
+// FlagRefs returns the number of references with all specified flags.
+func (c FlagCounter) FlagRefs(flags BitFlags) int {
 	var total int
-	for i, r := range p.refs {
-		if reuseFlag(i)&flags == flags {
+	for i, r := range c.refs {
+		if BitFlags(i)&flags == flags {
 			total += r
 		}
 	}
 	return total
 }
 
-// allRefsHave returns if all references have all specified flags.
-func (p portNode) allRefsHave(flags reuseFlag) bool {
-	for i, r := range p.refs {
-		if reuseFlag(i)&flags == flags && r > 0 {
+// AllRefsHave returns if all references have all specified flags.
+func (c FlagCounter) AllRefsHave(flags BitFlags) bool {
+	for i, r := range c.refs {
+		if BitFlags(i)&flags != flags && r > 0 {
 			return false
 		}
 	}
 	return true
 }
 
-// intersectionRefs returns the set of flags shared by all references.
-func (p portNode) intersectionRefs() reuseFlag {
-	intersection := flagMask
-	for i, r := range p.refs {
+// IntersectionRefs returns the set of flags shared by all references.
+func (c FlagCounter) IntersectionRefs() BitFlags {
+	intersection := FlagMask
+	for i, r := range c.refs {
 		if r > 0 {
-			intersection &= reuseFlag(i)
+			intersection &= BitFlags(i)
 		}
 	}
 	return intersection
@@ -135,26 +177,26 @@ func (p portNode) intersectionRefs() reuseFlag {
 
 // deviceNode is never empty. When it has no elements, it is removed from the
 // map that references it.
-type deviceNode map[tcpip.NICID]portNode
+type deviceNode map[tcpip.NICID]FlagCounter
 
 // isAvailable checks whether binding is possible by device. If not binding to a
-// device, check against all portNodes. If binding to a specific device, check
+// device, check against all FlagCounters. If binding to a specific device, check
 // against the unspecified device and the provided device.
 //
 // If either of the port reuse flags is enabled on any of the nodes, all nodes
 // sharing a port must share at least one reuse flag. This matches Linux's
 // behavior.
 func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID) bool {
-	flagBits := flags.bits()
+	flagBits := flags.Bits()
 	if bindToDevice == 0 {
 		// Trying to binding all devices.
 		if flagBits == 0 {
 			// Can't bind because the (addr,port) is already bound.
 			return false
 		}
-		intersection := flagMask
+		intersection := FlagMask
 		for _, p := range d {
-			i := p.intersectionRefs()
+			i := p.IntersectionRefs()
 			intersection &= i
 			if intersection&flagBits == 0 {
 				// Can't bind because the (addr,port) was
@@ -165,17 +207,17 @@ func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID) bool {
 		return true
 	}
 
-	intersection := flagMask
+	intersection := FlagMask
 
 	if p, ok := d[0]; ok {
-		intersection = p.intersectionRefs()
+		intersection = p.IntersectionRefs()
 		if intersection&flagBits == 0 {
 			return false
 		}
 	}
 
 	if p, ok := d[bindToDevice]; ok {
-		i := p.intersectionRefs()
+		i := p.IntersectionRefs()
 		intersection &= i
 		if intersection&flagBits == 0 {
 			return false
@@ -324,7 +366,7 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 	if !s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice) {
 		return false
 	}
-	flagBits := flags.bits()
+	flagBits := flags.Bits()
 
 	// Reserve port on all network protocols.
 	for _, network := range networks {
@@ -340,7 +382,7 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
 			m[addr] = d
 		}
 		n := d[bindToDevice]
-		n.refs[flagBits]++
+		n.AddRef(flagBits)
 		d[bindToDevice] = n
 	}
 
@@ -353,7 +395,7 @@ func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transp
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
-	flagBits := flags.bits()
+	flagBits := flags.Bits()
 
 	for _, network := range networks {
 		desc := portDescriptor{network, transport, port}
@@ -368,7 +410,7 @@ func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transp
 			}
 			n.refs[flagBits]--
 			d[bindToDevice] = n
-			if n.refs == [nextFlag]int{} {
+			if n.TotalRefs() == 0 {
 				delete(d, bindToDevice)
 			}
 			if len(d) == 0 {
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index f71073207..24f52b735 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -89,6 +89,7 @@ go_test(
         "//pkg/tcpip/link/loopback",
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/ports",
         "//pkg/tcpip/transport/icmp",
         "//pkg/tcpip/transport/udp",
         "//pkg/waiter",
@@ -110,5 +111,6 @@ go_test(
         "//pkg/sync",
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
     ],
 )
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go
index 7d1ede1f2..05bf62788 100644
--- a/pkg/tcpip/stack/conntrack.go
+++ b/pkg/tcpip/stack/conntrack.go
@@ -20,7 +20,6 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
-	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack"
@@ -147,46 +146,8 @@ type ConnTrackTable struct {
 	Seed uint32
 }
 
-// parseHeaders sets headers in the packet.
-func parseHeaders(pkt *PacketBuffer) {
-	newPkt := pkt.Clone()
-
-	// Set network header.
-	hdr, ok := newPkt.Data.PullUp(header.IPv4MinimumSize)
-	if !ok {
-		return
-	}
-	netHeader := header.IPv4(hdr)
-	newPkt.NetworkHeader = hdr
-	length := int(netHeader.HeaderLength())
-
-	// TODO(gvisor.dev/issue/170): Need to support for other
-	// protocols as well.
-	// Set transport header.
-	switch protocol := netHeader.TransportProtocol(); protocol {
-	case header.UDPProtocolNumber:
-		if newPkt.TransportHeader == nil {
-			h, ok := newPkt.Data.PullUp(length + header.UDPMinimumSize)
-			if !ok {
-				return
-			}
-			newPkt.TransportHeader = buffer.View(header.UDP(h[length:]))
-		}
-	case header.TCPProtocolNumber:
-		if newPkt.TransportHeader == nil {
-			h, ok := newPkt.Data.PullUp(length + header.TCPMinimumSize)
-			if !ok {
-				return
-			}
-			newPkt.TransportHeader = buffer.View(header.TCP(h[length:]))
-		}
-	}
-	pkt.NetworkHeader = newPkt.NetworkHeader
-	pkt.TransportHeader = newPkt.TransportHeader
-}
-
 // packetToTuple converts packet to a tuple in original direction.
-func packetToTuple(pkt PacketBuffer, hook Hook) (connTrackTuple, *tcpip.Error) {
+func packetToTuple(pkt *PacketBuffer, hook Hook) (connTrackTuple, *tcpip.Error) {
 	var tuple connTrackTuple
 
 	netHeader := header.IPv4(pkt.NetworkHeader)
@@ -257,15 +218,8 @@ func (ct *ConnTrackTable) getTupleHash(tuple connTrackTuple) uint32 {
 // TODO(gvisor.dev/issue/170): Only TCP packets are supported. Need to support other
 // transport protocols.
 func (ct *ConnTrackTable) connTrackForPacket(pkt *PacketBuffer, hook Hook, createConn bool) (*connTrack, ctDirection) {
-	if hook == Prerouting {
-		// Headers will not be set in Prerouting.
-		// TODO(gvisor.dev/issue/170): Change this after parsing headers
-		// code is added.
-		parseHeaders(pkt)
-	}
-
 	var dir ctDirection
-	tuple, err := packetToTuple(*pkt, hook)
+	tuple, err := packetToTuple(pkt, hook)
 	if err != nil {
 		return nil, dir
 	}
diff --git a/pkg/tcpip/stack/forwarder.go b/pkg/tcpip/stack/forwarder.go
index 6b64cd37f..3eff141e6 100644
--- a/pkg/tcpip/stack/forwarder.go
+++ b/pkg/tcpip/stack/forwarder.go
@@ -32,7 +32,7 @@ type pendingPacket struct {
 	nic   *NIC
 	route *Route
 	proto tcpip.NetworkProtocolNumber
-	pkt   PacketBuffer
+	pkt   *PacketBuffer
 }
 
 type forwardQueue struct {
@@ -50,7 +50,7 @@ func newForwardQueue() *forwardQueue {
 	return &forwardQueue{packets: make(map[<-chan struct{}][]*pendingPacket)}
 }
 
-func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
+func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
 	shouldWait := false
 
 	f.Lock()
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
index 8084d50bc..a6546cef0 100644
--- a/pkg/tcpip/stack/forwarder_test.go
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -33,6 +33,10 @@ const (
 	// except where another value is explicitly used. It is chosen to match
 	// the MTU of loopback interfaces on linux systems.
 	fwdTestNetDefaultMTU = 65536
+
+	dstAddrOffset        = 0
+	srcAddrOffset        = 1
+	protocolNumberOffset = 2
 )
 
 // fwdTestNetworkEndpoint is a network-layer protocol endpoint.
@@ -68,16 +72,9 @@ func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
 	return &f.id
 }
 
-func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt PacketBuffer) {
-	// Consume the network header.
-	b, ok := pkt.Data.PullUp(fwdTestNetHeaderLen)
-	if !ok {
-		return
-	}
-	pkt.Data.TrimFront(fwdTestNetHeaderLen)
-
+func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt *PacketBuffer) {
 	// Dispatch the packet to the transport protocol.
-	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), pkt)
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader[protocolNumberOffset]), pkt)
 }
 
 func (f *fwdTestNetworkEndpoint) MaxHeaderLength() uint16 {
@@ -96,13 +93,13 @@ func (f *fwdTestNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNu
 	return f.proto.Number()
 }
 
-func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error {
+func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
 	// Add the protocol's header to the packet and send it to the link
 	// endpoint.
 	b := pkt.Header.Prepend(fwdTestNetHeaderLen)
-	b[0] = r.RemoteAddress[0]
-	b[1] = f.id.LocalAddress[0]
-	b[2] = byte(params.Protocol)
+	b[dstAddrOffset] = r.RemoteAddress[0]
+	b[srcAddrOffset] = f.id.LocalAddress[0]
+	b[protocolNumberOffset] = byte(params.Protocol)
 
 	return f.ep.WritePacket(r, gso, fwdTestNetNumber, pkt)
 }
@@ -112,7 +109,7 @@ func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBuf
 	panic("not implemented")
 }
 
-func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt PacketBuffer) *tcpip.Error {
+func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -140,7 +137,17 @@ func (f *fwdTestNetworkProtocol) DefaultPrefixLen() int {
 }
 
 func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
-	return tcpip.Address(v[1:2]), tcpip.Address(v[0:1])
+	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
+}
+
+func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
+	netHeader, ok := pkt.Data.PullUp(fwdTestNetHeaderLen)
+	if !ok {
+		return 0, false, false
+	}
+	pkt.NetworkHeader = netHeader
+	pkt.Data.TrimFront(fwdTestNetHeaderLen)
+	return tcpip.TransportProtocolNumber(pkt.NetworkHeader[protocolNumberOffset]), true, true
 }
 
 func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) {
@@ -190,7 +197,7 @@ func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumb
 type fwdTestPacketInfo struct {
 	RemoteLinkAddress tcpip.LinkAddress
 	LocalLinkAddress  tcpip.LinkAddress
-	Pkt               PacketBuffer
+	Pkt               *PacketBuffer
 }
 
 type fwdTestLinkEndpoint struct {
@@ -203,13 +210,13 @@ type fwdTestLinkEndpoint struct {
 }
 
 // InjectInbound injects an inbound packet.
-func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
+func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
 	e.InjectLinkAddr(protocol, "", pkt)
 }
 
 // InjectLinkAddr injects an inbound packet with a remote link address.
-func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt PacketBuffer) {
-	e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt)
+func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
 }
 
 // Attach saves the stack network-layer dispatcher for use later when packets
@@ -251,7 +258,7 @@ func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress {
 	return e.linkAddr
 }
 
-func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) *tcpip.Error {
+func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
 	p := fwdTestPacketInfo{
 		RemoteLinkAddress: r.RemoteLinkAddress,
 		LocalLinkAddress:  r.LocalLinkAddress,
@@ -270,7 +277,7 @@ func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.Netw
 func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
 	n := 0
 	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
-		e.WritePacket(r, gso, protocol, *pkt)
+		e.WritePacket(r, gso, protocol, pkt)
 		n++
 	}
 
@@ -280,7 +287,7 @@ func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBuffer
 // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
 func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
 	p := fwdTestPacketInfo{
-		Pkt: PacketBuffer{Data: vv},
+		Pkt: &PacketBuffer{Data: vv},
 	}
 
 	select {
@@ -361,8 +368,8 @@ func TestForwardingWithStaticResolver(t *testing.T) {
 	// Inject an inbound packet to address 3 on NIC 1, and see if it is
 	// forwarded to NIC 2.
 	buf := buffer.NewView(30)
-	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -398,8 +405,8 @@ func TestForwardingWithFakeResolver(t *testing.T) {
 	// Inject an inbound packet to address 3 on NIC 1, and see if it is
 	// forwarded to NIC 2.
 	buf := buffer.NewView(30)
-	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -429,8 +436,8 @@ func TestForwardingWithNoResolver(t *testing.T) {
 	// inject an inbound packet to address 3 on NIC 1, and see if it is
 	// forwarded to NIC 2.
 	buf := buffer.NewView(30)
-	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -459,16 +466,16 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 	// Inject an inbound packet to address 4 on NIC 1. This packet should
 	// not be forwarded.
 	buf := buffer.NewView(30)
-	buf[0] = 4
-	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
+	buf[dstAddrOffset] = 4
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
 	// Inject an inbound packet to address 3 on NIC 1, and see if it is
 	// forwarded to NIC 2.
 	buf = buffer.NewView(30)
-	buf[0] = 3
-	ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 
@@ -480,9 +487,8 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
 		t.Fatal("packet not forwarded")
 	}
 
-	b := p.Pkt.Data.ToView()
-	if b[0] != 3 {
-		t.Fatalf("got b[0] = %d, want = 3", b[0])
+	if p.Pkt.NetworkHeader[dstAddrOffset] != 3 {
+		t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", p.Pkt.NetworkHeader[dstAddrOffset])
 	}
 
 	// Test that the address resolution happened correctly.
@@ -509,8 +515,8 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
 	// Inject two inbound packets to address 3 on NIC 1.
 	for i := 0; i < 2; i++ {
 		buf := buffer.NewView(30)
-		buf[0] = 3
-		ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
+		buf[dstAddrOffset] = 3
+		ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
 			Data: buf.ToVectorisedView(),
 		})
 	}
@@ -524,9 +530,8 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Data.ToView()
-		if b[0] != 3 {
-			t.Fatalf("got b[0] = %d, want = 3", b[0])
+		if p.Pkt.NetworkHeader[dstAddrOffset] != 3 {
+			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", p.Pkt.NetworkHeader[dstAddrOffset])
 		}
 
 		// Test that the address resolution happened correctly.
@@ -554,10 +559,10 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
 	for i := 0; i < maxPendingPacketsPerResolution+5; i++ {
 		// Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1.
 		buf := buffer.NewView(30)
-		buf[0] = 3
+		buf[dstAddrOffset] = 3
 		// Set the packet sequence number.
 		binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
-		ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
+		ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
 			Data: buf.ToVectorisedView(),
 		})
 	}
@@ -571,14 +576,18 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
 			t.Fatal("packet not forwarded")
 		}
 
-		b := p.Pkt.Data.ToView()
-		if b[0] != 3 {
-			t.Fatalf("got b[0] = %d, want = 3", b[0])
+		if b := p.Pkt.Header.View(); b[dstAddrOffset] != 3 {
+			t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset])
+		}
+		seqNumBuf, ok := p.Pkt.Data.PullUp(2) // The sequence number is a uint16 (2 bytes).
+		if !ok {
+			t.Fatalf("p.Pkt.Data is too short to hold a sequence number: %d", p.Pkt.Data.Size())
 		}
-		// The first 5 packets should not be forwarded so the the
-		// sequemnce number should start with 5.
+
+		// The first 5 packets should not be forwarded so the sequence number should
+		// start with 5.
 		want := uint16(i + 5)
-		if n := binary.BigEndian.Uint16(b[fwdTestNetHeaderLen:]); n != want {
+		if n := binary.BigEndian.Uint16(seqNumBuf); n != want {
 			t.Fatalf("got the packet #%d, want = #%d", n, want)
 		}
 
@@ -609,8 +618,8 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
 		// Each packet has a different destination address (3 to
 		// maxPendingResolutions + 7).
 		buf := buffer.NewView(30)
-		buf[0] = byte(3 + i)
-		ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{
+		buf[dstAddrOffset] = byte(3 + i)
+		ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
 			Data: buf.ToVectorisedView(),
 		})
 	}
@@ -626,9 +635,8 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
 
 		// The first 5 packets (address 3 to 7) should not be forwarded
 		// because their address resolutions are interrupted.
-		b := p.Pkt.Data.ToView()
-		if b[0] < 8 {
-			t.Fatalf("got b[0] = %d, want b[0] >= 8", b[0])
+		if p.Pkt.NetworkHeader[dstAddrOffset] < 8 {
+			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", p.Pkt.NetworkHeader[dstAddrOffset])
 		}
 
 		// Test that the address resolution happened correctly.
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 443423b3c..4e9b404c8 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -16,7 +16,6 @@ package stack
 
 import (
 	"fmt"
-	"strings"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -44,11 +43,11 @@ const HookUnset = -1
 
 // DefaultTables returns a default set of tables. Each chain is set to accept
 // all packets.
-func DefaultTables() IPTables {
+func DefaultTables() *IPTables {
 	// TODO(gvisor.dev/issue/170): We may be able to swap out some strings for
 	// iotas.
-	return IPTables{
-		Tables: map[string]Table{
+	return &IPTables{
+		tables: map[string]Table{
 			TablenameNat: Table{
 				Rules: []Rule{
 					Rule{Target: AcceptTarget{}},
@@ -107,7 +106,7 @@ func DefaultTables() IPTables {
 				UserChains: map[string]int{},
 			},
 		},
-		Priorities: map[Hook][]string{
+		priorities: map[Hook][]string{
 			Input:      []string{TablenameNat, TablenameFilter},
 			Prerouting: []string{TablenameMangle, TablenameNat},
 			Output:     []string{TablenameMangle, TablenameNat, TablenameFilter},
@@ -159,6 +158,36 @@ func EmptyNatTable() Table {
 	}
 }
 
+// GetTable returns table by name.
+func (it *IPTables) GetTable(name string) (Table, bool) {
+	it.mu.RLock()
+	defer it.mu.RUnlock()
+	t, ok := it.tables[name]
+	return t, ok
+}
+
+// ReplaceTable replaces or inserts table by name.
+func (it *IPTables) ReplaceTable(name string, table Table) {
+	it.mu.Lock()
+	defer it.mu.Unlock()
+	it.tables[name] = table
+}
+
+// ModifyTables acquires write-lock and calls fn with internal name-to-table
+// map. This function can be used to update multiple tables atomically.
+func (it *IPTables) ModifyTables(fn func(map[string]Table)) {
+	it.mu.Lock()
+	defer it.mu.Unlock()
+	fn(it.tables)
+}
+
+// GetPriorities returns slice of priorities associated with hook.
+func (it *IPTables) GetPriorities(hook Hook) []string {
+	it.mu.RLock()
+	defer it.mu.RUnlock()
+	return it.priorities[hook]
+}
+
 // A chainVerdict is what a table decides should be done with a packet.
 type chainVerdict int
 
@@ -185,8 +214,8 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr
 	it.connections.HandlePacket(pkt, hook, gso, r)
 
 	// Go through each table containing the hook.
-	for _, tablename := range it.Priorities[hook] {
-		table := it.Tables[tablename]
+	for _, tablename := range it.GetPriorities(hook) {
+		table, _ := it.GetTable(tablename)
 		ruleIdx := table.BuiltinChains[hook]
 		switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict {
 		// If the table returns Accept, move on to the next table.
@@ -314,7 +343,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx
 	}
 
 	// Check whether the packet matches the IP header filter.
-	if !filterMatch(rule.Filter, header.IPv4(pkt.NetworkHeader), hook, nicName) {
+	if !rule.Filter.match(header.IPv4(pkt.NetworkHeader), hook, nicName) {
 		// Continue on to the next rule.
 		return RuleJump, ruleIdx + 1
 	}
@@ -322,7 +351,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx
 	// Go through each rule matcher. If they all match, run
 	// the rule target.
 	for _, matcher := range rule.Matchers {
-		matches, hotdrop := matcher.Match(hook, *pkt, "")
+		matches, hotdrop := matcher.Match(hook, pkt, "")
 		if hotdrop {
 			return RuleDrop, 0
 		}
@@ -335,47 +364,3 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx
 	// All the matchers matched, so run the target.
 	return rule.Target.Action(pkt, &it.connections, hook, gso, r, address)
 }
-
-func filterMatch(filter IPHeaderFilter, hdr header.IPv4, hook Hook, nicName string) bool {
-	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
-	// Check the transport protocol.
-	if filter.Protocol != 0 && filter.Protocol != hdr.TransportProtocol() {
-		return false
-	}
-
-	// Check the destination IP.
-	dest := hdr.DestinationAddress()
-	matches := true
-	for i := range filter.Dst {
-		if dest[i]&filter.DstMask[i] != filter.Dst[i] {
-			matches = false
-			break
-		}
-	}
-	if matches == filter.DstInvert {
-		return false
-	}
-
-	// Check the output interface.
-	// TODO(gvisor.dev/issue/170): Add the check for FORWARD and POSTROUTING
-	// hooks after supported.
-	if hook == Output {
-		n := len(filter.OutputInterface)
-		if n == 0 {
-			return true
-		}
-
-		// If the interface name ends with '+', any interface which begins
-		// with the name should be matched.
-		ifName := filter.OutputInterface
-		matches = true
-		if strings.HasSuffix(ifName, "+") {
-			matches = strings.HasPrefix(nicName, ifName[:n-1])
-		} else {
-			matches = nicName == ifName
-		}
-		return filter.OutputInterfaceInvert != matches
-	}
-
-	return true
-}
diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go
index 36cc6275d..92e31643e 100644
--- a/pkg/tcpip/stack/iptables_targets.go
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -98,11 +98,6 @@ func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrackTable, hook Hook
 		return RuleAccept, 0
 	}
 
-	// Set network header.
-	if hook == Prerouting {
-		parseHeaders(pkt)
-	}
-
 	// Drop the packet if network and transport header are not set.
 	if pkt.NetworkHeader == nil || pkt.TransportHeader == nil {
 		return RuleDrop, 0
diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go
index fe06007ae..4a6a5c6f1 100644
--- a/pkg/tcpip/stack/iptables_types.go
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -15,7 +15,11 @@
 package stack
 
 import (
+	"strings"
+	"sync"
+
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 // A Hook specifies one of the hooks built into the network stack.
@@ -75,13 +79,17 @@ const (
 
 // IPTables holds all the tables for a netstack.
 type IPTables struct {
-	// Tables maps table names to tables. User tables have arbitrary names.
-	Tables map[string]Table
+	// mu protects tables and priorities.
+	mu sync.RWMutex
+
+	// tables maps table names to tables. User tables have arbitrary names. mu
+	// needs to be locked for accessing.
+	tables map[string]Table
 
-	// Priorities maps each hook to a list of table names. The order of the
+	// priorities maps each hook to a list of table names. The order of the
 	// list is the order in which each table should be visited for that
-	// hook.
-	Priorities map[Hook][]string
+	// hook. mu needs to be locked for accessing.
+	priorities map[Hook][]string
 
 	connections ConnTrackTable
 }
@@ -159,6 +167,16 @@ type IPHeaderFilter struct {
 	// comparison.
 	DstInvert bool
 
+	// Src matches the source IP address.
+	Src tcpip.Address
+
+	// SrcMask masks bits of the source IP address when comparing with Src.
+	SrcMask tcpip.Address
+
+	// SrcInvert inverts the meaning of the source IP check, i.e. when true the
+	// filter will match packets that fail the source comparison.
+	SrcInvert bool
+
 	// OutputInterface matches the name of the outgoing interface for the
 	// packet.
 	OutputInterface string
@@ -173,6 +191,55 @@ type IPHeaderFilter struct {
 	OutputInterfaceInvert bool
 }
 
+// match returns whether hdr matches the filter.
+func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool {
+	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
+	// Check the transport protocol.
+	if fl.Protocol != 0 && fl.Protocol != hdr.TransportProtocol() {
+		return false
+	}
+
+	// Check the source and destination IPs.
+	if !filterAddress(hdr.DestinationAddress(), fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(hdr.SourceAddress(), fl.SrcMask, fl.Src, fl.SrcInvert) {
+		return false
+	}
+
+	// Check the output interface.
+	// TODO(gvisor.dev/issue/170): Add the check for FORWARD and POSTROUTING
+	// hooks after supported.
+	if hook == Output {
+		n := len(fl.OutputInterface)
+		if n == 0 {
+			return true
+		}
+
+		// If the interface name ends with '+', any interface which begins
+		// with the name should be matched.
+		ifName := fl.OutputInterface
+		matches := true
+		if strings.HasSuffix(ifName, "+") {
+			matches = strings.HasPrefix(nicName, ifName[:n-1])
+		} else {
+			matches = nicName == ifName
+		}
+		return fl.OutputInterfaceInvert != matches
+	}
+
+	return true
+}
+
+// filterAddress returns whether addr matches the filter.
+func filterAddress(addr, mask, filterAddr tcpip.Address, invert bool) bool {
+	matches := true
+	for i := range filterAddr {
+		if addr[i]&mask[i] != filterAddr[i] {
+			matches = false
+			break
+		}
+	}
+	return matches != invert
+}
+
 // A Matcher is the interface for matching packets.
 type Matcher interface {
 	// Name returns the name of the Matcher.
@@ -183,7 +250,7 @@ type Matcher interface {
 	// used for suspicious packets.
 	//
 	// Precondition: packet.NetworkHeader is set.
-	Match(hook Hook, packet PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
+	Match(hook Hook, packet *PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
 }
 
 // A Target is the interface for taking an action for a packet.
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index 526c7d6ff..e28c23d66 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -467,8 +467,17 @@ type ndpState struct {
 	// The default routers discovered through Router Advertisements.
 	defaultRouters map[tcpip.Address]defaultRouterState
 
-	// The timer used to send the next router solicitation message.
-	rtrSolicitTimer *time.Timer
+	rtrSolicit struct {
+		// The timer used to send the next router solicitation message.
+		timer *time.Timer
+
+		// Used to let the Router Solicitation timer know that it has been stopped.
+		//
+		// Must only be read from or written to while protected by the lock of
+		// the NIC this ndpState is associated with. MUST be set when the timer is
+		// set.
+		done *bool
+	}
 
 	// The on-link prefixes discovered through Router Advertisements' Prefix
 	// Information option.
@@ -648,13 +657,14 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 	// as starting a goroutine but we use a timer that fires immediately so we can
 	// reset it for the next DAD iteration.
 	timer = time.AfterFunc(0, func() {
-		ndp.nic.mu.RLock()
+		ndp.nic.mu.Lock()
+		defer ndp.nic.mu.Unlock()
+
 		if done {
 			// If we reach this point, it means that the DAD timer fired after
 			// another goroutine already obtained the NIC lock and stopped DAD
 			// before this function obtained the NIC lock. Simply return here and do
 			// nothing further.
-			ndp.nic.mu.RUnlock()
 			return
 		}
 
@@ -665,15 +675,23 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 		}
 
 		dadDone := remaining == 0
-		ndp.nic.mu.RUnlock()
 
 		var err *tcpip.Error
 		if !dadDone {
-			err = ndp.sendDADPacket(addr)
+			// Use the unspecified address as the source address when performing DAD.
+			ref := ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint)
+
+			// Do not hold the lock when sending packets which may be a long running
+			// task or may block link address resolution. We know this is safe
+			// because immediately after obtaining the lock again, we check if DAD
+			// has been stopped before doing any work with the NIC. Note, DAD would be
+			// stopped if the NIC was disabled or removed, or if the address was
+			// removed.
+			ndp.nic.mu.Unlock()
+			err = ndp.sendDADPacket(addr, ref)
+			ndp.nic.mu.Lock()
 		}
 
-		ndp.nic.mu.Lock()
-		defer ndp.nic.mu.Unlock()
 		if done {
 			// If we reach this point, it means that DAD was stopped after we released
 			// the NIC's read lock and before we obtained the write lock.
@@ -721,17 +739,24 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref
 // addr.
 //
 // addr must be a tentative IPv6 address on ndp's NIC.
-func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
+//
+// The NIC ndp belongs to MUST NOT be locked.
+func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
 	snmc := header.SolicitedNodeAddr(addr)
 
-	// Use the unspecified address as the source address when performing DAD.
-	ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
-	r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+	r := makeRoute(header.IPv6ProtocolNumber, ref.ep.ID().LocalAddress, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 	defer r.Release()
 
 	// Route should resolve immediately since snmc is a multicast address so a
 	// remote link address can be calculated without a resolution process.
 	if c, err := r.Resolve(nil); err != nil {
+		// Do not consider the NIC being unknown or disabled as a fatal error.
+		// Since this method is required to be called when the NIC is not locked,
+		// the NIC could have been disabled or removed by another goroutine.
+		if err == tcpip.ErrUnknownNICID || err != tcpip.ErrInvalidEndpointState {
+			return err
+		}
+
 		panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err))
 	} else if c != nil {
 		panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID()))
@@ -750,7 +775,7 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error {
 			Protocol: header.ICMPv6ProtocolNumber,
 			TTL:      header.NDPHopLimit,
 			TOS:      DefaultTOS,
-		}, PacketBuffer{Header: hdr},
+		}, &PacketBuffer{Header: hdr},
 	); err != nil {
 		sent.Dropped.Increment()
 		return err
@@ -1816,7 +1841,7 @@ func (ndp *ndpState) cleanupState(hostOnly bool) {
 //
 // The NIC ndp belongs to MUST be locked.
 func (ndp *ndpState) startSolicitingRouters() {
-	if ndp.rtrSolicitTimer != nil {
+	if ndp.rtrSolicit.timer != nil {
 		// We are already soliciting routers.
 		return
 	}
@@ -1833,14 +1858,27 @@ func (ndp *ndpState) startSolicitingRouters() {
 		delay = time.Duration(rand.Int63n(int64(ndp.configs.MaxRtrSolicitationDelay)))
 	}
 
-	ndp.rtrSolicitTimer = time.AfterFunc(delay, func() {
+	var done bool
+	ndp.rtrSolicit.done = &done
+	ndp.rtrSolicit.timer = time.AfterFunc(delay, func() {
+		ndp.nic.mu.Lock()
+		if done {
+			// If we reach this point, it means that the RS timer fired after another
+			// goroutine already obtained the NIC lock and stopped solicitations.
+			// Simply return here and do nothing further.
+			ndp.nic.mu.Unlock()
+			return
+		}
+
 		// As per RFC 4861 section 4.1, the source of the RS is an address assigned
 		// to the sending interface, or the unspecified address if no address is
 		// assigned to the sending interface.
-		ref := ndp.nic.primaryIPv6Endpoint(header.IPv6AllRoutersMulticastAddress)
+		ref := ndp.nic.primaryIPv6EndpointRLocked(header.IPv6AllRoutersMulticastAddress)
 		if ref == nil {
-			ref = ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing)
+			ref = ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint)
 		}
+		ndp.nic.mu.Unlock()
+
 		localAddr := ref.ep.ID().LocalAddress
 		r := makeRoute(header.IPv6ProtocolNumber, localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
 		defer r.Release()
@@ -1849,6 +1887,13 @@ func (ndp *ndpState) startSolicitingRouters() {
 		// header.IPv6AllRoutersMulticastAddress is a multicast address so a
 		// remote link address can be calculated without a resolution process.
 		if c, err := r.Resolve(nil); err != nil {
+			// Do not consider the NIC being unknown or disabled as a fatal error.
+			// Since this method is required to be called when the NIC is not locked,
+			// the NIC could have been disabled or removed by another goroutine.
+			if err == tcpip.ErrUnknownNICID || err == tcpip.ErrInvalidEndpointState {
+				return
+			}
+
 			panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err))
 		} else if c != nil {
 			panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID()))
@@ -1881,7 +1926,7 @@ func (ndp *ndpState) startSolicitingRouters() {
 				Protocol: header.ICMPv6ProtocolNumber,
 				TTL:      header.NDPHopLimit,
 				TOS:      DefaultTOS,
-			}, PacketBuffer{Header: hdr},
+			}, &PacketBuffer{Header: hdr},
 		); err != nil {
 			sent.Dropped.Increment()
 			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err)
@@ -1893,17 +1938,18 @@ func (ndp *ndpState) startSolicitingRouters() {
 		}
 
 		ndp.nic.mu.Lock()
-		defer ndp.nic.mu.Unlock()
-		if remaining == 0 {
-			ndp.rtrSolicitTimer = nil
-		} else if ndp.rtrSolicitTimer != nil {
+		if done || remaining == 0 {
+			ndp.rtrSolicit.timer = nil
+			ndp.rtrSolicit.done = nil
+		} else if ndp.rtrSolicit.timer != nil {
 			// Note, we need to explicitly check to make sure that
 			// the timer field is not nil because if it was nil but
 			// we still reached this point, then we know the NIC
 			// was requested to stop soliciting routers so we don't
 			// need to send the next Router Solicitation message.
-			ndp.rtrSolicitTimer.Reset(ndp.configs.RtrSolicitationInterval)
+			ndp.rtrSolicit.timer.Reset(ndp.configs.RtrSolicitationInterval)
 		}
+		ndp.nic.mu.Unlock()
 	})
 
 }
@@ -1913,13 +1959,15 @@ func (ndp *ndpState) startSolicitingRouters() {
 //
 // The NIC ndp belongs to MUST be locked.
 func (ndp *ndpState) stopSolicitingRouters() {
-	if ndp.rtrSolicitTimer == nil {
+	if ndp.rtrSolicit.timer == nil {
 		// Nothing to do.
 		return
 	}
 
-	ndp.rtrSolicitTimer.Stop()
-	ndp.rtrSolicitTimer = nil
+	*ndp.rtrSolicit.done = true
+	ndp.rtrSolicit.timer.Stop()
+	ndp.rtrSolicit.timer = nil
+	ndp.rtrSolicit.done = nil
 }
 
 // initializeTempAddrState initializes state related to temporary SLAAC
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index b3d174cdd..58f1ebf60 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -613,7 +613,7 @@ func TestDADFail(t *testing.T) {
 			// Receive a packet to simulate multiple nodes owning or
 			// attempting to own the same address.
 			hdr := test.makeBuf(addr1)
-			e.InjectInbound(header.IPv6ProtocolNumber, stack.PacketBuffer{
+			e.InjectInbound(header.IPv6ProtocolNumber, &stack.PacketBuffer{
 				Data: hdr.View().ToVectorisedView(),
 			})
 
@@ -935,7 +935,7 @@ func TestSetNDPConfigurations(t *testing.T) {
 
 // raBufWithOptsAndDHCPv6 returns a valid NDP Router Advertisement with options
 // and DHCPv6 configurations specified.
-func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) stack.PacketBuffer {
+func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) *stack.PacketBuffer {
 	icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + int(optSer.Length())
 	hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
 	pkt := header.ICMPv6(hdr.Prepend(icmpSize))
@@ -970,14 +970,14 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo
 		DstAddr:       header.IPv6AllNodesMulticastAddress,
 	})
 
-	return stack.PacketBuffer{Data: hdr.View().ToVectorisedView()}
+	return &stack.PacketBuffer{Data: hdr.View().ToVectorisedView()}
 }
 
 // raBufWithOpts returns a valid NDP Router Advertisement with options.
 //
 // Note, raBufWithOpts does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) stack.PacketBuffer {
+func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) *stack.PacketBuffer {
 	return raBufWithOptsAndDHCPv6(ip, rl, false, false, optSer)
 }
 
@@ -986,7 +986,7 @@ func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializ
 //
 // Note, raBufWithDHCPv6 does not populate any of the RA fields other than the
 // DHCPv6 related ones.
-func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) stack.PacketBuffer {
+func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) *stack.PacketBuffer {
 	return raBufWithOptsAndDHCPv6(ip, 0, managedAddresses, otherConfiguratiosns, header.NDPOptionsSerializer{})
 }
 
@@ -994,7 +994,7 @@ func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bo
 //
 // Note, raBuf does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBuf(ip tcpip.Address, rl uint16) stack.PacketBuffer {
+func raBuf(ip tcpip.Address, rl uint16) *stack.PacketBuffer {
 	return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{})
 }
 
@@ -1003,7 +1003,7 @@ func raBuf(ip tcpip.Address, rl uint16) stack.PacketBuffer {
 //
 // Note, raBufWithPI does not populate any of the RA fields other than the
 // Router Lifetime.
-func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) stack.PacketBuffer {
+func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) *stack.PacketBuffer {
 	flags := uint8(0)
 	if onLink {
 		// The OnLink flag is the 7th bit in the flags byte.
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 54103fdb3..644c0d437 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -457,8 +457,20 @@ type ipv6AddrCandidate struct {
 // remoteAddr must be a valid IPv6 address.
 func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEndpoint {
 	n.mu.RLock()
-	defer n.mu.RUnlock()
+	ref := n.primaryIPv6EndpointRLocked(remoteAddr)
+	n.mu.RUnlock()
+	return ref
+}
 
+// primaryIPv6EndpointLocked returns an IPv6 endpoint following Source Address
+// Selection (RFC 6724 section 5).
+//
+// Note, only rules 1-3 and 7 are followed.
+//
+// remoteAddr must be a valid IPv6 address.
+//
+// n.mu MUST be read locked.
+func (n *NIC) primaryIPv6EndpointRLocked(remoteAddr tcpip.Address) *referencedNetworkEndpoint {
 	primaryAddrs := n.mu.primary[header.IPv6ProtocolNumber]
 
 	if len(primaryAddrs) == 0 {
@@ -568,11 +580,6 @@ const (
 	// promiscuous indicates that the NIC's promiscuous flag should be observed
 	// when getting a NIC's referenced network endpoint.
 	promiscuous
-
-	// forceSpoofing indicates that the NIC should be assumed to be spoofing,
-	// regardless of what the NIC's spoofing flag is when getting a NIC's
-	// referenced network endpoint.
-	forceSpoofing
 )
 
 func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint {
@@ -591,8 +598,6 @@ func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.A
 // or spoofing. Promiscuous mode will only be checked if promiscuous is true.
 // Similarly, spoofing will only be checked if spoofing is true.
 func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getRefBehaviour) *referencedNetworkEndpoint {
-	id := NetworkEndpointID{address}
-
 	n.mu.RLock()
 
 	var spoofingOrPromiscuous bool
@@ -601,11 +606,9 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 		spoofingOrPromiscuous = n.mu.spoofing
 	case promiscuous:
 		spoofingOrPromiscuous = n.mu.promiscuous
-	case forceSpoofing:
-		spoofingOrPromiscuous = true
 	}
 
-	if ref, ok := n.mu.endpoints[id]; ok {
+	if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok {
 		// An endpoint with this id exists, check if it can be used and return it.
 		switch ref.getKind() {
 		case permanentExpired:
@@ -654,11 +657,18 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	// endpoint, create a new "temporary" endpoint. It will only exist while
 	// there's a route through it.
 	n.mu.Lock()
-	if ref, ok := n.mu.endpoints[id]; ok {
+	ref := n.getRefOrCreateTempLocked(protocol, address, peb)
+	n.mu.Unlock()
+	return ref
+}
+
+/// getRefOrCreateTempLocked returns an existing endpoint for address or creates
+/// and returns a temporary endpoint.
+func (n *NIC) getRefOrCreateTempLocked(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint {
+	if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok {
 		// No need to check the type as we are ok with expired endpoints at this
 		// point.
 		if ref.tryIncRef() {
-			n.mu.Unlock()
 			return ref
 		}
 		// tryIncRef failing means the endpoint is scheduled to be removed once the
@@ -670,7 +680,6 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 	// Add a new temporary endpoint.
 	netProto, ok := n.stack.networkProtocols[protocol]
 	if !ok {
-		n.mu.Unlock()
 		return nil
 	}
 	ref, _ := n.addAddressLocked(tcpip.ProtocolAddress{
@@ -681,7 +690,6 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
 		},
 	}, peb, temporary, static, false)
 
-	n.mu.Unlock()
 	return ref
 }
 
@@ -1153,7 +1161,7 @@ func (n *NIC) isInGroup(addr tcpip.Address) bool {
 	return joins != 0
 }
 
-func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt PacketBuffer) {
+func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt *PacketBuffer) {
 	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
 	r.RemoteLinkAddress = remotelinkAddr
 
@@ -1167,7 +1175,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address,
 // Note that the ownership of the slice backing vv is retained by the caller.
 // This rule applies only to the slice itself, not to the items of the slice;
 // the ownership of the items is not retained by the caller.
-func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
+func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
 	n.mu.RLock()
 	enabled := n.mu.enabled
 	// If the NIC is not yet enabled, don't receive any packets.
@@ -1212,12 +1220,21 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 		n.stack.stats.IP.PacketsReceived.Increment()
 	}
 
-	netHeader, ok := pkt.Data.PullUp(netProto.MinimumPacketSize())
+	// Parse headers.
+	transProtoNum, hasTransportHdr, ok := netProto.Parse(pkt)
 	if !ok {
+		// The packet is too small to contain a network header.
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
-	src, dst := netProto.ParseAddresses(netHeader)
+	if hasTransportHdr {
+		// Parse the transport header if present.
+		if state, ok := n.stack.transportProtocols[transProtoNum]; ok {
+			state.proto.Parse(pkt)
+		}
+	}
+
+	src, dst := netProto.ParseAddresses(pkt.NetworkHeader)
 
 	if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil {
 		// The source address is one of our own, so we never should have gotten a
@@ -1229,18 +1246,19 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 
 	// TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet.
-	if protocol == header.IPv4ProtocolNumber {
+	// Loopback traffic skips the prerouting chain.
+	if protocol == header.IPv4ProtocolNumber && !n.isLoopback() {
 		// iptables filtering.
 		ipt := n.stack.IPTables()
 		address := n.primaryAddress(protocol)
-		if ok := ipt.Check(Prerouting, &pkt, nil, nil, address.Address, ""); !ok {
+		if ok := ipt.Check(Prerouting, pkt, nil, nil, address.Address, ""); !ok {
 			// iptables is telling us to drop the packet.
 			return
 		}
 	}
 
 	if ref := n.getRef(protocol, dst); ref != nil {
-		handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, pkt)
+		handlePacket(protocol, dst, src, n.linkEP.LinkAddress(), remote, ref, pkt)
 		return
 	}
 
@@ -1298,24 +1316,37 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link
 	}
 }
 
-func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) {
+func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
 	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
-	if linkHeaderLen := int(n.linkEP.MaxHeaderLength()); linkHeaderLen != 0 {
-		pkt.Header = buffer.NewPrependable(linkHeaderLen)
+	// TODO(b/151227689): Avoid copying the packet when forwarding. We can do this
+	// by having lower layers explicity write each header instead of just
+	// pkt.Header.
+
+	// pkt may have set its NetworkHeader and TransportHeader. If we're
+	// forwarding, we'll have to copy them into pkt.Header.
+	pkt.Header = buffer.NewPrependable(int(n.linkEP.MaxHeaderLength()) + len(pkt.NetworkHeader) + len(pkt.TransportHeader))
+	if n := copy(pkt.Header.Prepend(len(pkt.TransportHeader)), pkt.TransportHeader); n != len(pkt.TransportHeader) {
+		panic(fmt.Sprintf("copied %d bytes, expected %d", n, len(pkt.TransportHeader)))
+	}
+	if n := copy(pkt.Header.Prepend(len(pkt.NetworkHeader)), pkt.NetworkHeader); n != len(pkt.NetworkHeader) {
+		panic(fmt.Sprintf("copied %d bytes, expected %d", n, len(pkt.NetworkHeader)))
 	}
 
+	// WritePacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Header.UsedLength() + pkt.Data.Size()
+
 	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return
 	}
 
 	n.stats.Tx.Packets.Increment()
-	n.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size()))
+	n.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
 }
 
 // DeliverTransportPacket delivers the packets to the appropriate transport
 // protocol endpoint.
-func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer) {
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) {
 	state, ok := n.stack.transportProtocols[protocol]
 	if !ok {
 		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
@@ -1329,13 +1360,31 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 	// validly formed.
 	n.stack.demux.deliverRawPacket(r, protocol, pkt)
 
-	transHeader, ok := pkt.Data.PullUp(transProto.MinimumPacketSize())
-	if !ok {
+	// TransportHeader is nil only when pkt is an ICMP packet or was reassembled
+	// from fragments.
+	if pkt.TransportHeader == nil {
+		// TODO(gvisor.dev/issue/170): ICMP packets don't have their
+		// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
+		// full explanation.
+		if protocol == header.ICMPv4ProtocolNumber || protocol == header.ICMPv6ProtocolNumber {
+			transHeader, ok := pkt.Data.PullUp(transProto.MinimumPacketSize())
+			if !ok {
+				n.stack.stats.MalformedRcvdPackets.Increment()
+				return
+			}
+			pkt.TransportHeader = transHeader
+		} else {
+			// This is either a bad packet or was re-assembled from fragments.
+			transProto.Parse(pkt)
+		}
+	}
+
+	if len(pkt.TransportHeader) < transProto.MinimumPacketSize() {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
 	}
 
-	srcPort, dstPort, err := transProto.ParsePorts(transHeader)
+	srcPort, dstPort, err := transProto.ParsePorts(pkt.TransportHeader)
 	if err != nil {
 		n.stack.stats.MalformedRcvdPackets.Increment()
 		return
@@ -1362,7 +1411,7 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN
 
 // DeliverTransportControlPacket delivers control packets to the appropriate
 // transport protocol endpoint.
-func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer) {
+func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer) {
 	state, ok := n.stack.transportProtocols[trans]
 	if !ok {
 		return
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
index d672fc157..31f865260 100644
--- a/pkg/tcpip/stack/nic_test.go
+++ b/pkg/tcpip/stack/nic_test.go
@@ -15,11 +15,268 @@
 package stack
 
 import (
+	"math"
 	"testing"
+	"time"
 
+	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
+var _ LinkEndpoint = (*testLinkEndpoint)(nil)
+
+// A LinkEndpoint that throws away outgoing packets.
+//
+// We use this instead of the channel endpoint as the channel package depends on
+// the stack package which this test lives in, causing a cyclic dependency.
+type testLinkEndpoint struct {
+	dispatcher NetworkDispatcher
+}
+
+// Attach implements LinkEndpoint.Attach.
+func (e *testLinkEndpoint) Attach(dispatcher NetworkDispatcher) {
+	e.dispatcher = dispatcher
+}
+
+// IsAttached implements LinkEndpoint.IsAttached.
+func (e *testLinkEndpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements LinkEndpoint.MTU.
+func (*testLinkEndpoint) MTU() uint32 {
+	return math.MaxUint16
+}
+
+// Capabilities implements LinkEndpoint.Capabilities.
+func (*testLinkEndpoint) Capabilities() LinkEndpointCapabilities {
+	return CapabilityResolutionRequired
+}
+
+// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength.
+func (*testLinkEndpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (*testLinkEndpoint) LinkAddress() tcpip.LinkAddress {
+	return ""
+}
+
+// Wait implements LinkEndpoint.Wait.
+func (*testLinkEndpoint) Wait() {}
+
+// WritePacket implements LinkEndpoint.WritePacket.
+func (e *testLinkEndpoint) WritePacket(*Route, *GSO, tcpip.NetworkProtocolNumber, *PacketBuffer) *tcpip.Error {
+	return nil
+}
+
+// WritePackets implements LinkEndpoint.WritePackets.
+func (e *testLinkEndpoint) WritePackets(*Route, *GSO, PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	// Our tests don't use this so we don't support it.
+	return 0, tcpip.ErrNotSupported
+}
+
+// WriteRawPacket implements LinkEndpoint.WriteRawPacket.
+func (e *testLinkEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
+	// Our tests don't use this so we don't support it.
+	return tcpip.ErrNotSupported
+}
+
+var _ NetworkEndpoint = (*testIPv6Endpoint)(nil)
+
+// An IPv6 NetworkEndpoint that throws away outgoing packets.
+//
+// We use this instead of ipv6.endpoint because the ipv6 package depends on
+// the stack package which this test lives in, causing a cyclic dependency.
+type testIPv6Endpoint struct {
+	nicID     tcpip.NICID
+	id        NetworkEndpointID
+	prefixLen int
+	linkEP    LinkEndpoint
+	protocol  *testIPv6Protocol
+}
+
+// DefaultTTL implements NetworkEndpoint.DefaultTTL.
+func (*testIPv6Endpoint) DefaultTTL() uint8 {
+	return 0
+}
+
+// MTU implements NetworkEndpoint.MTU.
+func (e *testIPv6Endpoint) MTU() uint32 {
+	return e.linkEP.MTU() - header.IPv6MinimumSize
+}
+
+// Capabilities implements NetworkEndpoint.Capabilities.
+func (e *testIPv6Endpoint) Capabilities() LinkEndpointCapabilities {
+	return e.linkEP.Capabilities()
+}
+
+// MaxHeaderLength implements NetworkEndpoint.MaxHeaderLength.
+func (e *testIPv6Endpoint) MaxHeaderLength() uint16 {
+	return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize
+}
+
+// WritePacket implements NetworkEndpoint.WritePacket.
+func (*testIPv6Endpoint) WritePacket(*Route, *GSO, NetworkHeaderParams, *PacketBuffer) *tcpip.Error {
+	return nil
+}
+
+// WritePackets implements NetworkEndpoint.WritePackets.
+func (*testIPv6Endpoint) WritePackets(*Route, *GSO, PacketBufferList, NetworkHeaderParams) (int, *tcpip.Error) {
+	// Our tests don't use this so we don't support it.
+	return 0, tcpip.ErrNotSupported
+}
+
+// WriteHeaderIncludedPacket implements
+// NetworkEndpoint.WriteHeaderIncludedPacket.
+func (*testIPv6Endpoint) WriteHeaderIncludedPacket(*Route, *PacketBuffer) *tcpip.Error {
+	// Our tests don't use this so we don't support it.
+	return tcpip.ErrNotSupported
+}
+
+// ID implements NetworkEndpoint.ID.
+func (e *testIPv6Endpoint) ID() *NetworkEndpointID {
+	return &e.id
+}
+
+// PrefixLen implements NetworkEndpoint.PrefixLen.
+func (e *testIPv6Endpoint) PrefixLen() int {
+	return e.prefixLen
+}
+
+// NICID implements NetworkEndpoint.NICID.
+func (e *testIPv6Endpoint) NICID() tcpip.NICID {
+	return e.nicID
+}
+
+// HandlePacket implements NetworkEndpoint.HandlePacket.
+func (*testIPv6Endpoint) HandlePacket(*Route, *PacketBuffer) {
+}
+
+// Close implements NetworkEndpoint.Close.
+func (*testIPv6Endpoint) Close() {}
+
+// NetworkProtocolNumber implements NetworkEndpoint.NetworkProtocolNumber.
+func (*testIPv6Endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return header.IPv6ProtocolNumber
+}
+
+var _ NetworkProtocol = (*testIPv6Protocol)(nil)
+
+// An IPv6 NetworkProtocol that supports the bare minimum to make a stack
+// believe it supports IPv6.
+//
+// We use this instead of ipv6.protocol because the ipv6 package depends on
+// the stack package which this test lives in, causing a cyclic dependency.
+type testIPv6Protocol struct{}
+
+// Number implements NetworkProtocol.Number.
+func (*testIPv6Protocol) Number() tcpip.NetworkProtocolNumber {
+	return header.IPv6ProtocolNumber
+}
+
+// MinimumPacketSize implements NetworkProtocol.MinimumPacketSize.
+func (*testIPv6Protocol) MinimumPacketSize() int {
+	return header.IPv6MinimumSize
+}
+
+// DefaultPrefixLen implements NetworkProtocol.DefaultPrefixLen.
+func (*testIPv6Protocol) DefaultPrefixLen() int {
+	return header.IPv6AddressSize * 8
+}
+
+// ParseAddresses implements NetworkProtocol.ParseAddresses.
+func (*testIPv6Protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
+	h := header.IPv6(v)
+	return h.SourceAddress(), h.DestinationAddress()
+}
+
+// NewEndpoint implements NetworkProtocol.NewEndpoint.
+func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, _ LinkAddressCache, _ TransportDispatcher, linkEP LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) {
+	return &testIPv6Endpoint{
+		nicID:     nicID,
+		id:        NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
+		prefixLen: addrWithPrefix.PrefixLen,
+		linkEP:    linkEP,
+		protocol:  p,
+	}, nil
+}
+
+// SetOption implements NetworkProtocol.SetOption.
+func (*testIPv6Protocol) SetOption(interface{}) *tcpip.Error {
+	return nil
+}
+
+// Option implements NetworkProtocol.Option.
+func (*testIPv6Protocol) Option(interface{}) *tcpip.Error {
+	return nil
+}
+
+// Close implements NetworkProtocol.Close.
+func (*testIPv6Protocol) Close() {}
+
+// Wait implements NetworkProtocol.Wait.
+func (*testIPv6Protocol) Wait() {}
+
+// Parse implements NetworkProtocol.Parse.
+func (*testIPv6Protocol) Parse(*PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
+	return 0, false, false
+}
+
+var _ LinkAddressResolver = (*testIPv6Protocol)(nil)
+
+// LinkAddressProtocol implements LinkAddressResolver.
+func (*testIPv6Protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+	return header.IPv6ProtocolNumber
+}
+
+// LinkAddressRequest implements LinkAddressResolver.
+func (*testIPv6Protocol) LinkAddressRequest(_, _ tcpip.Address, _ LinkEndpoint) *tcpip.Error {
+	return nil
+}
+
+// ResolveStaticAddress implements LinkAddressResolver.
+func (*testIPv6Protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+	if header.IsV6MulticastAddress(addr) {
+		return header.EthernetAddressFromMulticastIPv6Address(addr), true
+	}
+	return "", false
+}
+
+// Test the race condition where a NIC is removed and an RS timer fires at the
+// same time.
+func TestRemoveNICWhileHandlingRSTimer(t *testing.T) {
+	const (
+		nicID = 1
+
+		maxRtrSolicitations = 5
+	)
+
+	e := testLinkEndpoint{}
+	s := New(Options{
+		NetworkProtocols: []NetworkProtocol{&testIPv6Protocol{}},
+		NDPConfigs: NDPConfigurations{
+			MaxRtrSolicitations:     maxRtrSolicitations,
+			RtrSolicitationInterval: minimumRtrSolicitationInterval,
+		},
+	})
+
+	if err := s.CreateNIC(nicID, &e); err != nil {
+		t.Fatalf("s.CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	s.mu.Lock()
+	// Wait for the router solicitation timer to fire and block trying to obtain
+	// the stack lock when doing link address resolution.
+	time.Sleep(minimumRtrSolicitationInterval * 2)
+	if err := s.removeNICLocked(nicID); err != nil {
+		t.Fatalf("s.removeNICLocked(%d) = %s", nicID, err)
+	}
+	s.mu.Unlock()
+}
+
 func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
 	// When the NIC is disabled, the only field that matters is the stats field.
 	// This test is limited to stats counter checks.
@@ -44,7 +301,7 @@ func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
 		t.FailNow()
 	}
 
-	nic.DeliverNetworkPacket(nil, "", "", 0, PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
+	nic.DeliverNetworkPacket("", "", 0, &PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
 
 	if got := nic.stats.DisabledRx.Packets.Value(); got != 1 {
 		t.Errorf("got DisabledRx.Packets = %d, want = 1", got)
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 926df4d7b..1b5da6017 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -24,6 +24,8 @@ import (
 // multiple endpoints. Clone() should be called in such cases so that
 // modifications to the Data field do not affect other copies.
 type PacketBuffer struct {
+	_ noCopy
+
 	// PacketBufferEntry is used to build an intrusive list of
 	// PacketBuffers.
 	PacketBufferEntry
@@ -82,7 +84,32 @@ type PacketBuffer struct {
 // VectorisedView but does not deep copy the underlying bytes.
 //
 // Clone also does not deep copy any of its other fields.
-func (pk PacketBuffer) Clone() PacketBuffer {
-	pk.Data = pk.Data.Clone(nil)
-	return pk
+//
+// FIXME(b/153685824): Data gets copied but not other header references.
+func (pk *PacketBuffer) Clone() *PacketBuffer {
+	return &PacketBuffer{
+		PacketBufferEntry:     pk.PacketBufferEntry,
+		Data:                  pk.Data.Clone(nil),
+		Header:                pk.Header,
+		LinkHeader:            pk.LinkHeader,
+		NetworkHeader:         pk.NetworkHeader,
+		TransportHeader:       pk.TransportHeader,
+		Hash:                  pk.Hash,
+		Owner:                 pk.Owner,
+		EgressRoute:           pk.EgressRoute,
+		GSOOptions:            pk.GSOOptions,
+		NetworkProtocolNumber: pk.NetworkProtocolNumber,
+		NatDone:               pk.NatDone,
+	}
 }
+
+// noCopy may be embedded into structs which must not be copied
+// after the first use.
+//
+// See https://golang.org/issues/8005#issuecomment-190753527
+// for details.
+type noCopy struct{}
+
+// Lock is a no-op used by -copylocks checker from `go vet`.
+func (*noCopy) Lock()   {}
+func (*noCopy) Unlock() {}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index b331427c6..5cbc946b6 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -67,12 +67,12 @@ type TransportEndpoint interface {
 	// this transport endpoint. It sets pkt.TransportHeader.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer)
+	HandlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer)
 
 	// HandleControlPacket is called by the stack when new control (e.g.
 	// ICMP) packets arrive to this transport endpoint.
 	// HandleControlPacket takes ownership of pkt.
-	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer)
+	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt *PacketBuffer)
 
 	// Abort initiates an expedited endpoint teardown. It puts the endpoint
 	// in a closed state and frees all resources associated with it. This
@@ -100,7 +100,7 @@ type RawTransportEndpoint interface {
 	// layer up.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, pkt PacketBuffer)
+	HandlePacket(r *Route, pkt *PacketBuffer)
 }
 
 // PacketEndpoint is the interface that needs to be implemented by packet
@@ -118,7 +118,7 @@ type PacketEndpoint interface {
 	// should construct its own ethernet header for applications.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt PacketBuffer)
+	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
 }
 
 // TransportProtocol is the interface that needs to be implemented by transport
@@ -150,7 +150,7 @@ type TransportProtocol interface {
 	// stats purposes only).
 	//
 	// HandleUnknownDestinationPacket takes ownership of pkt.
-	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt PacketBuffer) bool
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool
 
 	// SetOption allows enabling/disabling protocol specific features.
 	// SetOption returns an error if the option is not supported or the
@@ -168,6 +168,11 @@ type TransportProtocol interface {
 
 	// Wait waits for any worker goroutines owned by the protocol to stop.
 	Wait()
+
+	// Parse sets pkt.TransportHeader and trims pkt.Data appropriately. It does
+	// neither and returns false if pkt.Data is too small, i.e. pkt.Data.Size() <
+	// MinimumPacketSize()
+	Parse(pkt *PacketBuffer) (ok bool)
 }
 
 // TransportDispatcher contains the methods used by the network stack to deliver
@@ -180,7 +185,7 @@ type TransportDispatcher interface {
 	// pkt.NetworkHeader must be set before calling DeliverTransportPacket.
 	//
 	// DeliverTransportPacket takes ownership of pkt.
-	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer)
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer)
 
 	// DeliverTransportControlPacket delivers control packets to the
 	// appropriate transport protocol endpoint.
@@ -189,7 +194,7 @@ type TransportDispatcher interface {
 	// DeliverTransportControlPacket.
 	//
 	// DeliverTransportControlPacket takes ownership of pkt.
-	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer)
+	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer)
 }
 
 // PacketLooping specifies where an outbound packet should be sent.
@@ -240,17 +245,18 @@ type NetworkEndpoint interface {
 	MaxHeaderLength() uint16
 
 	// WritePacket writes a packet to the given destination address and
-	// protocol. It sets pkt.NetworkHeader. pkt.TransportHeader must have
-	// already been set.
-	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error
+	// protocol. It takes ownership of pkt. pkt.TransportHeader must have already
+	// been set.
+	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets to the given destination address and
-	// protocol. pkts must not be zero length.
+	// protocol. pkts must not be zero length. It takes ownership of pkts and
+	// underlying packets.
 	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error)
 
 	// WriteHeaderIncludedPacket writes a packet that includes a network
-	// header to the given destination address.
-	WriteHeaderIncludedPacket(r *Route, pkt PacketBuffer) *tcpip.Error
+	// header to the given destination address. It takes ownership of pkt.
+	WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error
 
 	// ID returns the network protocol endpoint ID.
 	ID() *NetworkEndpointID
@@ -265,7 +271,7 @@ type NetworkEndpoint interface {
 	// this network endpoint. It sets pkt.NetworkHeader.
 	//
 	// HandlePacket takes ownership of pkt.
-	HandlePacket(r *Route, pkt PacketBuffer)
+	HandlePacket(r *Route, pkt *PacketBuffer)
 
 	// Close is called when the endpoint is reomved from a stack.
 	Close()
@@ -312,6 +318,14 @@ type NetworkProtocol interface {
 
 	// Wait waits for any worker goroutines owned by the protocol to stop.
 	Wait()
+
+	// Parse sets pkt.NetworkHeader and trims pkt.Data appropriately. It
+	// returns:
+	// - The encapsulated protocol, if present.
+	// - Whether there is an encapsulated transport protocol payload (e.g. ARP
+	//   does not encapsulate anything).
+	// - Whether pkt.Data was large enough to parse and set pkt.NetworkHeader.
+	Parse(pkt *PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool)
 }
 
 // NetworkDispatcher contains the methods used by the network stack to deliver
@@ -326,7 +340,7 @@ type NetworkDispatcher interface {
 	// packets sent via loopback), and won't have the field set.
 	//
 	// DeliverNetworkPacket takes ownership of pkt.
-	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer)
+	DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
 }
 
 // LinkEndpointCapabilities is the type associated with the capabilities
@@ -382,17 +396,17 @@ type LinkEndpoint interface {
 	LinkAddress() tcpip.LinkAddress
 
 	// WritePacket writes a packet with the given protocol through the
-	// given route. It sets pkt.LinkHeader if a link layer header exists.
-	// pkt.NetworkHeader and pkt.TransportHeader must have already been
-	// set.
+	// given route. It takes ownership of pkt. pkt.NetworkHeader and
+	// pkt.TransportHeader must have already been set.
 	//
 	// To participate in transparent bridging, a LinkEndpoint implementation
 	// should call eth.Encode with header.EthernetFields.SrcAddr set to
 	// r.LocalLinkAddress if it is provided.
-	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) *tcpip.Error
+	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error
 
 	// WritePackets writes packets with the given protocol through the
-	// given route. pkts must not be zero length.
+	// given route. pkts must not be zero length. It takes ownership of pkts and
+	// underlying packets.
 	//
 	// Right now, WritePackets is used only when the software segmentation
 	// offload is enabled. If it will be used for something else, it may
@@ -400,7 +414,7 @@ type LinkEndpoint interface {
 	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
 
 	// WriteRawPacket writes a packet directly to the link. The packet
-	// should already have an ethernet header.
+	// should already have an ethernet header. It takes ownership of vv.
 	WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error
 
 	// Attach attaches the data link layer endpoint to the network-layer
@@ -430,7 +444,7 @@ type InjectableLinkEndpoint interface {
 	LinkEndpoint
 
 	// InjectInbound injects an inbound packet.
-	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer)
+	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
 
 	// InjectOutbound writes a fully formed outbound packet directly to the
 	// link.
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 150297ab9..d65f8049e 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -113,6 +113,8 @@ func (r *Route) GSOMaxSize() uint32 {
 // If address resolution is required, ErrNoLinkAddress and a notification channel is
 // returned for the top level caller to block. Channel is closed once address resolution
 // is complete (success or not).
+//
+// The NIC r uses must not be locked.
 func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
 	if !r.IsResolutionRequired() {
 		// Nothing to do if there is no cache (which does the resolution on cache miss) or
@@ -148,22 +150,27 @@ func (r *Route) RemoveWaker(waker *sleep.Waker) {
 
 // IsResolutionRequired returns true if Resolve() must be called to resolve
 // the link address before the this route can be written to.
+//
+// The NIC r uses must not be locked.
 func (r *Route) IsResolutionRequired() bool {
 	return r.ref.isValidForOutgoing() && r.ref.linkCache != nil && r.RemoteLinkAddress == ""
 }
 
 // WritePacket writes the packet through the given route.
-func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error {
+func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
 
+	// WritePacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Header.UsedLength() + pkt.Data.Size()
+
 	err := r.ref.ep.WritePacket(r, gso, params, pkt)
 	if err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 	} else {
 		r.ref.nic.stats.Tx.Packets.Increment()
-		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size()))
+		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
 	}
 	return err
 }
@@ -175,9 +182,12 @@ func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHead
 		return 0, tcpip.ErrInvalidEndpointState
 	}
 
+	// WritePackets takes ownership of pkt, calculate length first.
+	numPkts := pkts.Len()
+
 	n, err := r.ref.ep.WritePackets(r, gso, pkts, params)
 	if err != nil {
-		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n))
+		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(numPkts - n))
 	}
 	r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n))
 
@@ -193,17 +203,20 @@ func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHead
 
 // WriteHeaderIncludedPacket writes a packet already containing a network
 // header through the given route.
-func (r *Route) WriteHeaderIncludedPacket(pkt PacketBuffer) *tcpip.Error {
+func (r *Route) WriteHeaderIncludedPacket(pkt *PacketBuffer) *tcpip.Error {
 	if !r.ref.isValidForOutgoing() {
 		return tcpip.ErrInvalidEndpointState
 	}
 
+	// WriteHeaderIncludedPacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Data.Size()
+
 	if err := r.ref.ep.WriteHeaderIncludedPacket(r, pkt); err != nil {
 		r.Stats().IP.OutgoingPacketErrors.Increment()
 		return err
 	}
 	r.ref.nic.stats.Tx.Packets.Increment()
-	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
+	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
 	return nil
 }
 
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index b39ffa9fb..a2190341c 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -52,7 +52,7 @@ const (
 
 type transportProtocolState struct {
 	proto          TransportProtocol
-	defaultHandler func(r *Route, id TransportEndpointID, pkt PacketBuffer) bool
+	defaultHandler func(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool
 }
 
 // TCPProbeFunc is the expected function type for a TCP probe function to be
@@ -235,11 +235,11 @@ type RcvBufAutoTuneParams struct {
 	// was started.
 	MeasureTime time.Time
 
-	// CopiedBytes is the number of bytes copied to user space since
+	// CopiedBytes is the number of bytes copied to userspace since
 	// this measure began.
 	CopiedBytes int
 
-	// PrevCopiedBytes is the number of bytes copied to user space in
+	// PrevCopiedBytes is the number of bytes copied to userspace in
 	// the previous RTT period.
 	PrevCopiedBytes int
 
@@ -424,12 +424,8 @@ type Stack struct {
 	// handleLocal allows non-loopback interfaces to loop packets.
 	handleLocal bool
 
-	// tablesMu protects iptables.
-	tablesMu sync.RWMutex
-
-	// tables are the iptables packet filtering and manipulation rules. The are
-	// protected by tablesMu.`
-	tables IPTables
+	// tables are the iptables packet filtering and manipulation rules.
+	tables *IPTables
 
 	// resumableEndpoints is a list of endpoints that need to be resumed if the
 	// stack is being restored.
@@ -676,6 +672,7 @@ func New(opts Options) *Stack {
 		clock:                clock,
 		stats:                opts.Stats.FillIn(),
 		handleLocal:          opts.HandleLocal,
+		tables:               DefaultTables(),
 		icmpRateLimiter:      NewICMPRateLimiter(),
 		seed:                 generateRandUint32(),
 		ndpConfigs:           opts.NDPConfigs,
@@ -778,7 +775,7 @@ func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber,
 //
 // It must be called only during initialization of the stack. Changing it as the
 // stack is operating is not supported.
-func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, PacketBuffer) bool) {
+func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, *PacketBuffer) bool) {
 	state := s.transportProtocols[p]
 	if state != nil {
 		state.defaultHandler = h
@@ -1020,6 +1017,13 @@ func (s *Stack) RemoveNIC(id tcpip.NICID) *tcpip.Error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
+	return s.removeNICLocked(id)
+}
+
+// removeNICLocked removes NIC and all related routes from the network stack.
+//
+// s.mu must be locked.
+func (s *Stack) removeNICLocked(id tcpip.NICID) *tcpip.Error {
 	nic, ok := s.nics[id]
 	if !ok {
 		return tcpip.ErrUnknownNICID
@@ -1400,25 +1404,25 @@ func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.
 // transport dispatcher. Received packets that match the provided id will be
 // delivered to the given endpoint; specifying a nic is optional, but
 // nic-specific IDs have precedence over global ones.
-func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
-	return s.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort, bindToDevice)
+func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+	return s.demux.registerEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
 }
 
 // UnregisterTransportEndpoint removes the endpoint with the given id from the
 // stack transport dispatcher.
-func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
-	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice)
+func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
+	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
 }
 
 // StartTransportEndpointCleanup removes the endpoint with the given id from
 // the stack transport dispatcher. It also transitions it to the cleanup stage.
-func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
+func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
 	s.cleanupEndpoints[ep] = struct{}{}
 
-	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice)
+	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
 }
 
 // CompleteTransportEndpointCleanup removes the endpoint from the cleanup
@@ -1741,18 +1745,8 @@ func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool,
 }
 
 // IPTables returns the stack's iptables.
-func (s *Stack) IPTables() IPTables {
-	s.tablesMu.RLock()
-	t := s.tables
-	s.tablesMu.RUnlock()
-	return t
-}
-
-// SetIPTables sets the stack's iptables.
-func (s *Stack) SetIPTables(ipt IPTables) {
-	s.tablesMu.Lock()
-	s.tables = ipt
-	s.tablesMu.Unlock()
+func (s *Stack) IPTables() *IPTables {
+	return s.tables
 }
 
 // ICMPLimit returns the maximum number of ICMP messages that can be sent
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 1a2cf007c..ffef9bc2c 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -52,6 +52,10 @@ const (
 	// where another value is explicitly used. It is chosen to match the MTU
 	// of loopback interfaces on linux systems.
 	defaultMTU = 65536
+
+	dstAddrOffset        = 0
+	srcAddrOffset        = 1
+	protocolNumberOffset = 2
 )
 
 // fakeNetworkEndpoint is a network-layer protocol endpoint. It counts sent and
@@ -90,30 +94,28 @@ func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID {
 	return &f.id
 }
 
-func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) {
+func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
 	// Increment the received packet count in the protocol descriptor.
 	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
 
-	// Consume the network header.
-	b, ok := pkt.Data.PullUp(fakeNetHeaderLen)
-	if !ok {
-		return
-	}
-	pkt.Data.TrimFront(fakeNetHeaderLen)
-
 	// Handle control packets.
-	if b[2] == uint8(fakeControlProtocol) {
+	if pkt.NetworkHeader[protocolNumberOffset] == uint8(fakeControlProtocol) {
 		nb, ok := pkt.Data.PullUp(fakeNetHeaderLen)
 		if !ok {
 			return
 		}
 		pkt.Data.TrimFront(fakeNetHeaderLen)
-		f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, pkt)
+		f.dispatcher.DeliverTransportControlPacket(
+			tcpip.Address(nb[srcAddrOffset:srcAddrOffset+1]),
+			tcpip.Address(nb[dstAddrOffset:dstAddrOffset+1]),
+			fakeNetNumber,
+			tcpip.TransportProtocolNumber(nb[protocolNumberOffset]),
+			stack.ControlPortUnreachable, 0, pkt)
 		return
 	}
 
 	// Dispatch the packet to the transport protocol.
-	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), pkt)
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader[protocolNumberOffset]), pkt)
 }
 
 func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 {
@@ -132,24 +134,19 @@ func (f *fakeNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumbe
 	return f.proto.Number()
 }
 
-func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error {
+func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
 	// Increment the sent packet count in the protocol descriptor.
 	f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++
 
 	// Add the protocol's header to the packet and send it to the link
 	// endpoint.
-	b := pkt.Header.Prepend(fakeNetHeaderLen)
-	b[0] = r.RemoteAddress[0]
-	b[1] = f.id.LocalAddress[0]
-	b[2] = byte(params.Protocol)
+	pkt.NetworkHeader = pkt.Header.Prepend(fakeNetHeaderLen)
+	pkt.NetworkHeader[dstAddrOffset] = r.RemoteAddress[0]
+	pkt.NetworkHeader[srcAddrOffset] = f.id.LocalAddress[0]
+	pkt.NetworkHeader[protocolNumberOffset] = byte(params.Protocol)
 
 	if r.Loop&stack.PacketLoop != 0 {
-		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
-		views[0] = pkt.Header.View()
-		views = append(views, pkt.Data.Views()...)
-		f.HandlePacket(r, stack.PacketBuffer{
-			Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
-		})
+		f.HandlePacket(r, pkt)
 	}
 	if r.Loop&stack.PacketOut == 0 {
 		return nil
@@ -163,7 +160,7 @@ func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts
 	panic("not implemented")
 }
 
-func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error {
+func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
 	return tcpip.ErrNotSupported
 }
 
@@ -205,7 +202,7 @@ func (f *fakeNetworkProtocol) PacketCount(intfAddr byte) int {
 }
 
 func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
-	return tcpip.Address(v[1:2]), tcpip.Address(v[0:1])
+	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
 }
 
 func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
@@ -247,6 +244,17 @@ func (*fakeNetworkProtocol) Close() {}
 // Wait implements TransportProtocol.Wait.
 func (*fakeNetworkProtocol) Wait() {}
 
+// Parse implements TransportProtocol.Parse.
+func (*fakeNetworkProtocol) Parse(pkt *stack.PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
+	hdr, ok := pkt.Data.PullUp(fakeNetHeaderLen)
+	if !ok {
+		return 0, false, false
+	}
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(fakeNetHeaderLen)
+	return tcpip.TransportProtocolNumber(hdr[protocolNumberOffset]), true, true
+}
+
 func fakeNetFactory() stack.NetworkProtocol {
 	return &fakeNetworkProtocol{}
 }
@@ -292,8 +300,8 @@ func TestNetworkReceive(t *testing.T) {
 	buf := buffer.NewView(30)
 
 	// Make sure packet with wrong address is not delivered.
-	buf[0] = 3
-	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	buf[dstAddrOffset] = 3
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 0 {
@@ -304,8 +312,8 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is delivered to first endpoint.
-	buf[0] = 1
-	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	buf[dstAddrOffset] = 1
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -316,8 +324,8 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is delivered to second endpoint.
-	buf[0] = 2
-	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	buf[dstAddrOffset] = 2
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -328,7 +336,7 @@ func TestNetworkReceive(t *testing.T) {
 	}
 
 	// Make sure packet is not delivered if protocol number is wrong.
-	ep.InjectInbound(fakeNetNumber-1, stack.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber-1, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -340,7 +348,7 @@ func TestNetworkReceive(t *testing.T) {
 
 	// Make sure packet that is too small is dropped.
 	buf.CapLength(2)
-	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeNet.packetCount[1] != 1 {
@@ -362,7 +370,7 @@ func sendTo(s *stack.Stack, addr tcpip.Address, payload buffer.View) *tcpip.Erro
 
 func send(r stack.Route, payload buffer.View) *tcpip.Error {
 	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()))
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   payload.ToVectorisedView(),
 	})
@@ -420,7 +428,7 @@ func testFailingRecv(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte b
 
 func testRecvInternal(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View, want int) {
 	t.Helper()
-	ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if got := fakeNet.PacketCount(localAddrByte); got != want {
@@ -982,7 +990,7 @@ func TestAddressRemoval(t *testing.T) {
 	buf := buffer.NewView(30)
 
 	// Send and receive packets, and verify they are received.
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	testRecv(t, fakeNet, localAddrByte, ep, buf)
 	testSendTo(t, s, remoteAddr, ep, nil)
 
@@ -1032,7 +1040,7 @@ func TestAddressRemovalWithRouteHeld(t *testing.T) {
 	}
 
 	// Send and receive packets, and verify they are received.
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	testRecv(t, fakeNet, localAddrByte, ep, buf)
 	testSend(t, r, ep, nil)
 	testSendTo(t, s, remoteAddr, ep, nil)
@@ -1114,7 +1122,7 @@ func TestEndpointExpiration(t *testing.T) {
 
 				fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
 				buf := buffer.NewView(30)
-				buf[0] = localAddrByte
+				buf[dstAddrOffset] = localAddrByte
 
 				if promiscuous {
 					if err := s.SetPromiscuousMode(nicID, true); err != nil {
@@ -1277,7 +1285,7 @@ func TestPromiscuousMode(t *testing.T) {
 	// Write a packet, and check that it doesn't get delivered as we don't
 	// have a matching endpoint.
 	const localAddrByte byte = 0x01
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
 
 	// Set promiscuous mode, then check that packet is delivered.
@@ -1658,7 +1666,7 @@ func TestAddressRangeAcceptsMatchingPacket(t *testing.T) {
 	buf := buffer.NewView(30)
 
 	const localAddrByte byte = 0x01
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	subnet, err := tcpip.NewSubnet(tcpip.Address("\x00"), tcpip.AddressMask("\xF0"))
 	if err != nil {
 		t.Fatal("NewSubnet failed:", err)
@@ -1766,7 +1774,7 @@ func TestAddressRangeRejectsNonmatchingPacket(t *testing.T) {
 	buf := buffer.NewView(30)
 
 	const localAddrByte byte = 0x01
-	buf[0] = localAddrByte
+	buf[dstAddrOffset] = localAddrByte
 	subnet, err := tcpip.NewSubnet(tcpip.Address("\x10"), tcpip.AddressMask("\xF0"))
 	if err != nil {
 		t.Fatal("NewSubnet failed:", err)
@@ -2263,7 +2271,7 @@ func TestNICStats(t *testing.T) {
 
 	// Send a packet to address 1.
 	buf := buffer.NewView(30)
-	ep1.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	ep1.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if got, want := s.NICInfo()[1].Stats.Rx.Packets.Value(), uint64(1); got != want {
@@ -2344,8 +2352,8 @@ func TestNICForwarding(t *testing.T) {
 
 			// Send a packet to dstAddr.
 			buf := buffer.NewView(30)
-			buf[0] = dstAddr[0]
-			ep1.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+			buf[dstAddrOffset] = dstAddr[0]
+			ep1.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 				Data: buf.ToVectorisedView(),
 			})
 
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 9a33ed375..118b449d5 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -15,7 +15,6 @@
 package stack
 
 import (
-	"container/heap"
 	"fmt"
 	"math/rand"
 
@@ -23,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 )
 
 type protocolIDs struct {
@@ -43,14 +43,14 @@ type transportEndpoints struct {
 
 // unregisterEndpoint unregisters the endpoint with the given id such that it
 // won't receive any more packets.
-func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
+func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
 	eps.mu.Lock()
 	defer eps.mu.Unlock()
 	epsByNIC, ok := eps.endpoints[id]
 	if !ok {
 		return
 	}
-	if !epsByNIC.unregisterEndpoint(bindToDevice, ep) {
+	if !epsByNIC.unregisterEndpoint(bindToDevice, ep, flags) {
 		return
 	}
 	delete(eps.endpoints, id)
@@ -152,7 +152,7 @@ func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer) {
+func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) {
 	epsByNIC.mu.RLock()
 
 	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
@@ -183,7 +183,7 @@ func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, p
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer) {
+func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt *PacketBuffer) {
 	epsByNIC.mu.RLock()
 	defer epsByNIC.mu.RUnlock()
 
@@ -204,7 +204,7 @@ func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpoint
 
 // registerEndpoint returns true if it succeeds. It fails and returns
 // false if ep already has an element with the same key.
-func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
 	epsByNIC.mu.Lock()
 	defer epsByNIC.mu.Unlock()
 
@@ -214,23 +214,22 @@ func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto t
 			demux:      d,
 			netProto:   netProto,
 			transProto: transProto,
-			reuse:      reusePort,
 		}
 		epsByNIC.endpoints[bindToDevice] = multiPortEp
 	}
 
-	return multiPortEp.singleRegisterEndpoint(t, reusePort)
+	return multiPortEp.singleRegisterEndpoint(t, flags)
 }
 
 // unregisterEndpoint returns true if endpointsByNIC has to be unregistered.
-func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool {
+func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint, flags ports.Flags) bool {
 	epsByNIC.mu.Lock()
 	defer epsByNIC.mu.Unlock()
 	multiPortEp, ok := epsByNIC.endpoints[bindToDevice]
 	if !ok {
 		return false
 	}
-	if multiPortEp.unregisterEndpoint(t) {
+	if multiPortEp.unregisterEndpoint(t, flags) {
 		delete(epsByNIC.endpoints, bindToDevice)
 	}
 	return len(epsByNIC.endpoints) == 0
@@ -251,7 +250,7 @@ type transportDemuxer struct {
 // the dispatcher to delivery packets to the QueuePacket method instead of
 // calling HandlePacket directly on the endpoint.
 type queuedTransportProtocol interface {
-	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt PacketBuffer)
+	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt *PacketBuffer)
 }
 
 func newTransportDemuxer(stack *Stack) *transportDemuxer {
@@ -279,10 +278,10 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer {
 
 // registerEndpoint registers the given endpoint with the dispatcher such that
 // packets that match the endpoint ID are delivered to it.
-func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
 	for i, n := range netProtos {
-		if err := d.singleRegisterEndpoint(n, protocol, id, ep, reusePort, bindToDevice); err != nil {
-			d.unregisterEndpoint(netProtos[:i], protocol, id, ep, bindToDevice)
+		if err := d.singleRegisterEndpoint(n, protocol, id, ep, flags, bindToDevice); err != nil {
+			d.unregisterEndpoint(netProtos[:i], protocol, id, ep, flags, bindToDevice)
 			return err
 		}
 	}
@@ -290,35 +289,6 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
 	return nil
 }
 
-type transportEndpointHeap []TransportEndpoint
-
-var _ heap.Interface = (*transportEndpointHeap)(nil)
-
-func (h *transportEndpointHeap) Len() int {
-	return len(*h)
-}
-
-func (h *transportEndpointHeap) Less(i, j int) bool {
-	return (*h)[i].UniqueID() < (*h)[j].UniqueID()
-}
-
-func (h *transportEndpointHeap) Swap(i, j int) {
-	(*h)[i], (*h)[j] = (*h)[j], (*h)[i]
-}
-
-func (h *transportEndpointHeap) Push(x interface{}) {
-	*h = append(*h, x.(TransportEndpoint))
-}
-
-func (h *transportEndpointHeap) Pop() interface{} {
-	old := *h
-	n := len(old)
-	x := old[n-1]
-	old[n-1] = nil
-	*h = old[:n-1]
-	return x
-}
-
 // multiPortEndpoint is a container for TransportEndpoints which are bound to
 // the same pair of address and port. endpointsArr always has at least one
 // element.
@@ -334,9 +304,10 @@ type multiPortEndpoint struct {
 	netProto   tcpip.NetworkProtocolNumber
 	transProto tcpip.TransportProtocolNumber
 
-	endpoints transportEndpointHeap
-	// reuse indicates if more than one endpoint is allowed.
-	reuse bool
+	// endpoints stores the transport endpoints in the order in which they
+	// were bound. This is required for UDP SO_REUSEADDR.
+	endpoints []TransportEndpoint
+	flags     ports.FlagCounter
 }
 
 func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint {
@@ -362,6 +333,10 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 		return mpep.endpoints[0]
 	}
 
+	if mpep.flags.IntersectionRefs().ToFlags().Effective().MostRecent {
+		return mpep.endpoints[len(mpep.endpoints)-1]
+	}
+
 	payload := []byte{
 		byte(id.LocalPort),
 		byte(id.LocalPort >> 8),
@@ -379,7 +354,7 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
 	return mpep.endpoints[idx]
 }
 
-func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt PacketBuffer) {
+func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt *PacketBuffer) {
 	ep.mu.RLock()
 	queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
 	// HandlePacket takes ownership of pkt, so each endpoint needs
@@ -401,40 +376,47 @@ func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, p
 
 // singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint
 // list. The list might be empty already.
-func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePort bool) *tcpip.Error {
+func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, flags ports.Flags) *tcpip.Error {
 	ep.mu.Lock()
 	defer ep.mu.Unlock()
 
+	bits := flags.Bits()
+
 	if len(ep.endpoints) != 0 {
 		// If it was previously bound, we need to check if we can bind again.
-		if !ep.reuse || !reusePort {
+		if ep.flags.TotalRefs() > 0 && bits&ep.flags.IntersectionRefs() == 0 {
 			return tcpip.ErrPortInUse
 		}
 	}
 
-	heap.Push(&ep.endpoints, t)
+	ep.endpoints = append(ep.endpoints, t)
+	ep.flags.AddRef(bits)
 
 	return nil
 }
 
 // unregisterEndpoint returns true if multiPortEndpoint has to be unregistered.
-func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint) bool {
+func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint, flags ports.Flags) bool {
 	ep.mu.Lock()
 	defer ep.mu.Unlock()
 
 	for i, endpoint := range ep.endpoints {
 		if endpoint == t {
-			heap.Remove(&ep.endpoints, i)
+			copy(ep.endpoints[i:], ep.endpoints[i+1:])
+			ep.endpoints[len(ep.endpoints)-1] = nil
+			ep.endpoints = ep.endpoints[:len(ep.endpoints)-1]
+
+			ep.flags.DropRef(flags.Bits())
 			break
 		}
 	}
 	return len(ep.endpoints) == 0
 }
 
-func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
 	if id.RemotePort != 0 {
-		// TODO(eyalsoha): Why?
-		reusePort = false
+		// SO_REUSEPORT only applies to bound/listening endpoints.
+		flags.LoadBalanced = false
 	}
 
 	eps, ok := d.protocol[protocolIDs{netProto, protocol}]
@@ -454,15 +436,20 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
 		eps.endpoints[id] = epsByNIC
 	}
 
-	return epsByNIC.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
+	return epsByNIC.registerEndpoint(d, netProto, protocol, ep, flags, bindToDevice)
 }
 
 // unregisterEndpoint unregisters the endpoint with the given id such that it
 // won't receive any more packets.
-func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) {
+func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
+	if id.RemotePort != 0 {
+		// SO_REUSEPORT only applies to bound/listening endpoints.
+		flags.LoadBalanced = false
+	}
+
 	for _, n := range netProtos {
 		if eps, ok := d.protocol[protocolIDs{n, protocol}]; ok {
-			eps.unregisterEndpoint(id, ep, bindToDevice)
+			eps.unregisterEndpoint(id, ep, flags, bindToDevice)
 		}
 	}
 }
@@ -470,7 +457,7 @@ func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolN
 // deliverPacket attempts to find one or more matching transport endpoints, and
 // then, if matches are found, delivers the packet to them. Returns true if
 // the packet no longer needs to be handled.
-func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -520,7 +507,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto
 
 // deliverRawPacket attempts to deliver the given packet and returns whether it
 // was delivered successfully.
-func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer) bool {
+func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) bool {
 	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
 	if !ok {
 		return false
@@ -544,7 +531,7 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr
 
 // deliverControlPacket attempts to deliver the given control packet. Returns
 // true if it found an endpoint, false otherwise.
-func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer, id TransportEndpointID) bool {
+func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer, id TransportEndpointID) bool {
 	eps, ok := d.protocol[protocolIDs{net, trans}]
 	if !ok {
 		return false
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 2474a7db3..73dada928 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -127,7 +128,7 @@ func (c *testContext) sendV4Packet(payload []byte, h *headers, linkEpID tcpip.NI
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEps[linkEpID].InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
+	c.linkEps[linkEpID].InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data:            buf.ToVectorisedView(),
 		NetworkHeader:   buffer.View(ip),
 		TransportHeader: buffer.View(u),
@@ -165,7 +166,7 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
+	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
 		Data:            buf.ToVectorisedView(),
 		NetworkHeader:   buffer.View(ip),
 		TransportHeader: buffer.View(u),
@@ -195,7 +196,7 @@ func TestTransportDemuxerRegister(t *testing.T) {
 			if !ok {
 				t.Fatalf("%T does not implement stack.TransportEndpoint", ep)
 			}
-			if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, false, 0), test.want; got != want {
+			if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, ports.Flags{}, 0), test.want; got != want {
 				t.Fatalf("s.RegisterTransportEndpoint(...) = %s, want %s", got, want)
 			}
 		})
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index a611e44ab..7e8b84867 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
 	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -83,12 +84,13 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions
 		return 0, nil, tcpip.ErrNoRoute
 	}
 
-	hdr := buffer.NewPrependable(int(f.route.MaxHeaderLength()))
+	hdr := buffer.NewPrependable(int(f.route.MaxHeaderLength()) + fakeTransHeaderLen)
+	hdr.Prepend(fakeTransHeaderLen)
 	v, err := p.FullPayload()
 	if err != nil {
 		return 0, nil, err
 	}
-	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{
+	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 		Header: hdr,
 		Data:   buffer.View(v).ToVectorisedView(),
 	}); err != nil {
@@ -153,7 +155,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 
 	// Try to register so that we can start receiving packets.
 	f.ID.RemoteAddress = addr.Addr
-	err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, false /* reuse */, 0 /* bindToDevice */)
+	err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, ports.Flags{}, 0 /* bindToDevice */)
 	if err != nil {
 		return err
 	}
@@ -198,8 +200,8 @@ func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error {
 		fakeTransNumber,
 		stack.TransportEndpointID{LocalAddress: a.Addr},
 		f,
-		false, /* reuse */
-		0,     /* bindtoDevice */
+		ports.Flags{},
+		0, /* bindtoDevice */
 	); err != nil {
 		return err
 	}
@@ -215,7 +217,7 @@ func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Erro
 	return tcpip.FullAddress{}, nil
 }
 
-func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ stack.PacketBuffer) {
+func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ *stack.PacketBuffer) {
 	// Increment the number of received packets.
 	f.proto.packetCount++
 	if f.acceptQueue != nil {
@@ -232,7 +234,7 @@ func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportE
 	}
 }
 
-func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, stack.PacketBuffer) {
+func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, *stack.PacketBuffer) {
 	// Increment the number of received control packets.
 	f.proto.controlCount++
 }
@@ -289,7 +291,7 @@ func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcp
 	return 0, 0, nil
 }
 
-func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, stack.PacketBuffer) bool {
+func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool {
 	return true
 }
 
@@ -324,6 +326,17 @@ func (*fakeTransportProtocol) Close() {}
 // Wait implements TransportProtocol.Wait.
 func (*fakeTransportProtocol) Wait() {}
 
+// Parse implements TransportProtocol.Parse.
+func (*fakeTransportProtocol) Parse(pkt *stack.PacketBuffer) bool {
+	hdr, ok := pkt.Data.PullUp(fakeTransHeaderLen)
+	if !ok {
+		return false
+	}
+	pkt.TransportHeader = hdr
+	pkt.Data.TrimFront(fakeTransHeaderLen)
+	return true
+}
+
 func fakeTransFactory() stack.TransportProtocol {
 	return &fakeTransportProtocol{}
 }
@@ -369,7 +382,7 @@ func TestTransportReceive(t *testing.T) {
 	// Make sure packet with wrong protocol is not delivered.
 	buf[0] = 1
 	buf[2] = 0
-	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 0 {
@@ -380,7 +393,7 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 3
 	buf[2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 0 {
@@ -391,7 +404,7 @@ func TestTransportReceive(t *testing.T) {
 	buf[0] = 1
 	buf[1] = 2
 	buf[2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.packetCount != 1 {
@@ -446,7 +459,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 0
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = 0
-	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 0 {
@@ -457,7 +470,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 3
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 0 {
@@ -468,7 +481,7 @@ func TestTransportControlReceive(t *testing.T) {
 	buf[fakeNetHeaderLen+0] = 2
 	buf[fakeNetHeaderLen+1] = 1
 	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
-	linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 	if fakeTrans.controlCount != 1 {
@@ -623,7 +636,7 @@ func TestTransportForwarding(t *testing.T) {
 	req[0] = 1
 	req[1] = 3
 	req[2] = byte(fakeTransNumber)
-	ep2.InjectInbound(fakeNetNumber, stack.PacketBuffer{
+	ep2.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
 		Data: req.ToVectorisedView(),
 	})
 
@@ -642,11 +655,10 @@ func TestTransportForwarding(t *testing.T) {
 		t.Fatal("Response packet not forwarded")
 	}
 
-	hdrs := p.Pkt.Data.ToView()
-	if dst := hdrs[0]; dst != 3 {
+	if dst := p.Pkt.NetworkHeader[0]; dst != 3 {
 		t.Errorf("Response packet has incorrect destination addresss: got = %d, want = 3", dst)
 	}
-	if src := hdrs[1]; src != 1 {
+	if src := p.Pkt.NetworkHeader[1]; src != 1 {
 		t.Errorf("Response packet has incorrect source addresss: got = %d, want = 3", src)
 	}
 }
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index 45e930ad8..b7b227328 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -110,6 +110,71 @@ var (
 	ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"}
 )
 
+var messageToError map[string]*Error
+
+var populate sync.Once
+
+// StringToError converts an error message to the error.
+func StringToError(s string) *Error {
+	populate.Do(func() {
+		var errors = []*Error{
+			ErrUnknownProtocol,
+			ErrUnknownNICID,
+			ErrUnknownDevice,
+			ErrUnknownProtocolOption,
+			ErrDuplicateNICID,
+			ErrDuplicateAddress,
+			ErrNoRoute,
+			ErrBadLinkEndpoint,
+			ErrAlreadyBound,
+			ErrInvalidEndpointState,
+			ErrAlreadyConnecting,
+			ErrAlreadyConnected,
+			ErrNoPortAvailable,
+			ErrPortInUse,
+			ErrBadLocalAddress,
+			ErrClosedForSend,
+			ErrClosedForReceive,
+			ErrWouldBlock,
+			ErrConnectionRefused,
+			ErrTimeout,
+			ErrAborted,
+			ErrConnectStarted,
+			ErrDestinationRequired,
+			ErrNotSupported,
+			ErrQueueSizeNotSupported,
+			ErrNotConnected,
+			ErrConnectionReset,
+			ErrConnectionAborted,
+			ErrNoSuchFile,
+			ErrInvalidOptionValue,
+			ErrNoLinkAddress,
+			ErrBadAddress,
+			ErrNetworkUnreachable,
+			ErrMessageTooLong,
+			ErrNoBufferSpace,
+			ErrBroadcastDisabled,
+			ErrNotPermitted,
+			ErrAddressFamilyNotSupported,
+		}
+
+		messageToError = make(map[string]*Error)
+		for _, e := range errors {
+			if messageToError[e.String()] != nil {
+				panic("tcpip errors with duplicated message: " + e.String())
+			}
+			messageToError[e.String()] = e
+		}
+	})
+
+	e, ok := messageToError[s]
+	if !ok {
+		panic("unknown error message: " + s)
+	}
+
+	return e
+}
+
 // Errors related to Subnet
 var (
 	errSubnetLengthMismatch = errors.New("subnet length of address and mask differ")
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
index 2f98a996f..7f172f978 100644
--- a/pkg/tcpip/time_unsafe.go
+++ b/pkg/tcpip/time_unsafe.go
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 // +build go1.9
-// +build !go1.15
+// +build !go1.16
 
 // Check go:linkname function signatures when updating Go version.
 
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index 9ce625c17..7e5c79776 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -31,6 +31,7 @@ go_library(
         "//pkg/tcpip",
         "//pkg/tcpip/buffer",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/ports",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/raw",
         "//pkg/tcpip/transport/tcp",
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index b1d820372..8ce294002 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -19,6 +19,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -110,7 +111,7 @@ func (e *endpoint) Close() {
 	e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite
 	switch e.state {
 	case stateBound, stateConnected:
-		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, []tcpip.NetworkProtocolNumber{e.NetProto}, e.TransProto, e.ID, e, 0 /* bindToDevice */)
+		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, []tcpip.NetworkProtocolNumber{e.NetProto}, e.TransProto, e.ID, e, ports.Flags{}, 0 /* bindToDevice */)
 	}
 
 	// Close the receive list and drain it.
@@ -140,11 +141,6 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (stack.IPTables, error) {
-	return e.stack.IPTables(), nil
-}
-
 // Read reads data from the endpoint. This method does not block if
 // there is no data pending.
 func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
@@ -450,7 +446,7 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpi
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, stack.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 		Header:          hdr,
 		Data:            data.ToVectorisedView(),
 		TransportHeader: buffer.View(icmpv4),
@@ -481,7 +477,7 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
 	if ttl == 0 {
 		ttl = r.DefaultTTL()
 	}
-	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, stack.PacketBuffer{
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 		Header:          hdr,
 		Data:            dataVV,
 		TransportHeader: buffer.View(icmpv6),
@@ -511,6 +507,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	nicID := addr.NIC
 	localPort := uint16(0)
 	switch e.state {
+	case stateInitial:
 	case stateBound, stateConnected:
 		localPort = e.ID.LocalPort
 		if e.BindNICID == 0 {
@@ -611,14 +608,14 @@ func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.Networ
 	if id.LocalPort != 0 {
 		// The endpoint already has a local port, just attempt to
 		// register it.
-		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, false /* reuse */, 0 /* bindToDevice */)
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindToDevice */)
 		return id, err
 	}
 
 	// We need to find a port for the endpoint.
 	_, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
 		id.LocalPort = p
-		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, false /* reuse */, 0 /* bindtodevice */)
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindtodevice */)
 		switch err {
 		case nil:
 			return true, nil
@@ -743,7 +740,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	// Only accept echo replies.
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
@@ -805,7 +802,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 }
 
 // State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
index 3c47692b2..74ef6541e 100644
--- a/pkg/tcpip/transport/icmp/protocol.go
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -104,7 +104,7 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, stack.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool {
 	return true
 }
 
@@ -124,6 +124,16 @@ func (*protocol) Close() {}
 // Wait implements stack.TransportProtocol.Wait.
 func (*protocol) Wait() {}
 
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
+	// TODO(gvisor.dev/issue/170): Implement parsing of ICMP.
+	//
+	// Right now, the Parse() method is tied to enabled protocols passed into
+	// stack.New. This works for UDP and TCP, but we handle ICMP traffic even
+	// when netstack users don't pass ICMP as a supported protocol.
+	return false
+}
+
 // NewProtocol4 returns an ICMPv4 transport protocol.
 func NewProtocol4() stack.TransportProtocol {
 	return &protocol{ProtocolNumber4}
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 23158173d..baf08eda6 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -132,11 +132,6 @@ func (ep *endpoint) Close() {
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (ep *endpoint) ModerateRecvBuf(copied int) {}
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (ep *endpoint) IPTables() (stack.IPTables, error) {
-	return ep.stack.IPTables(), nil
-}
-
 // Read implements tcpip.Endpoint.Read.
 func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
 	ep.rcvMu.Lock()
@@ -298,7 +293,7 @@ func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // HandlePacket implements stack.PacketEndpoint.HandlePacket.
-func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) {
+func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
 	ep.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index eee754a5a..a406d815e 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -166,11 +166,6 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (stack.IPTables, error) {
-	return e.stack.IPTables(), nil
-}
-
 // Read implements tcpip.Endpoint.Read.
 func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
 	if !e.associated {
@@ -348,7 +343,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 	switch e.NetProto {
 	case header.IPv4ProtocolNumber:
 		if !e.associated {
-			if err := route.WriteHeaderIncludedPacket(stack.PacketBuffer{
+			if err := route.WriteHeaderIncludedPacket(&stack.PacketBuffer{
 				Data: buffer.View(payloadBytes).ToVectorisedView(),
 			}); err != nil {
 				return 0, nil, err
@@ -357,7 +352,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 		}
 
 		hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength()))
-		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
+		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
 			Header: hdr,
 			Data:   buffer.View(payloadBytes).ToVectorisedView(),
 			Owner:  e.owner,
@@ -584,7 +579,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 }
 
 // HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
-func (e *endpoint) HandlePacket(route *stack.Route, pkt stack.PacketBuffer) {
+func (e *endpoint) HandlePacket(route *stack.Route, pkt *stack.PacketBuffer) {
 	e.rcvMu.Lock()
 
 	// Drop the packet if our buffer is currently full.
@@ -632,8 +627,9 @@ func (e *endpoint) HandlePacket(route *stack.Route, pkt stack.PacketBuffer) {
 		},
 	}
 
-	networkHeader := append(buffer.View(nil), pkt.NetworkHeader...)
-	combinedVV := networkHeader.ToVectorisedView()
+	headers := append(buffer.View(nil), pkt.NetworkHeader...)
+	headers = append(headers, pkt.TransportHeader...)
+	combinedVV := headers.ToVectorisedView()
 	combinedVV.Append(pkt.Data)
 	packet.data = combinedVV
 	packet.timestampNS = e.stack.NowNanoseconds()
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index f38eb6833..e26f01fae 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -86,10 +86,6 @@ go_test(
         "tcp_test.go",
         "tcp_timestamp_test.go",
     ],
-    # FIXME(b/68809571)
-    tags = [
-        "flaky",
-    ],
     deps = [
         ":tcp",
         "//pkg/sync",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index e6a23c978..ad197e8db 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -27,6 +27,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -238,13 +239,14 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	n.mu.Lock()
 
 	// Register new endpoint so that packets are routed to it.
-	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
+	if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, ports.Flags{LoadBalanced: n.reusePort}, n.boundBindToDevice); err != nil {
 		n.mu.Unlock()
 		n.Close()
 		return nil, err
 	}
 
 	n.isRegistered = true
+	n.registeredReusePort = n.reusePort
 
 	return n, nil
 }
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index a7e088d4e..7da93dcc4 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -833,13 +833,13 @@ func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stac
 		return sendTCPBatch(r, tf, data, gso, owner)
 	}
 
-	pkt := stack.PacketBuffer{
+	pkt := &stack.PacketBuffer{
 		Header: buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen),
 		Data:   data,
 		Hash:   tf.txHash,
 		Owner:  owner,
 	}
-	buildTCPHdr(r, tf, &pkt, gso)
+	buildTCPHdr(r, tf, pkt, gso)
 
 	if tf.ttl == 0 {
 		tf.ttl = r.DefaultTTL()
@@ -1347,6 +1347,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 			e.setEndpointState(StateError)
 			e.HardError = err
 
+			e.workerCleanup = true
 			// Lock released below.
 			epilogue()
 			return err
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
index 6062ca916..047704c80 100644
--- a/pkg/tcpip/transport/tcp/dispatcher.go
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -186,7 +186,7 @@ func (d *dispatcher) wait() {
 	}
 }
 
-func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
+func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	ep := stackEP.(*endpoint)
 	s := newSegment(r, id, pkt)
 	if !s.parse() {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 71735029e..6e4d607da 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -63,7 +63,8 @@ const (
 	StateClosing
 )
 
-// connected is the set of states where an endpoint is connected to a peer.
+// connected returns true when s is one of the states representing an
+// endpoint connected to a peer.
 func (s EndpointState) connected() bool {
 	switch s {
 	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
@@ -73,6 +74,40 @@ func (s EndpointState) connected() bool {
 	}
 }
 
+// connecting returns true when s is one of the states representing a
+// connection in progress, but not yet fully established.
+func (s EndpointState) connecting() bool {
+	switch s {
+	case StateConnecting, StateSynSent, StateSynRecv:
+		return true
+	default:
+		return false
+	}
+}
+
+// handshake returns true when s is one of the states representing an endpoint
+// in the middle of a TCP handshake.
+func (s EndpointState) handshake() bool {
+	switch s {
+	case StateSynSent, StateSynRecv:
+		return true
+	default:
+		return false
+	}
+}
+
+// closed returns true when s is one of the states an endpoint transitions to
+// when closed or when it encounters an error. This is distinct from a newly
+// initialized endpoint that was never connected.
+func (s EndpointState) closed() bool {
+	switch s {
+	case StateClose, StateError:
+		return true
+	default:
+		return false
+	}
+}
+
 // String implements fmt.Stringer.String.
 func (s EndpointState) String() string {
 	switch s {
@@ -430,6 +465,10 @@ type endpoint struct {
 	// reusePort is set to true if SO_REUSEPORT is enabled.
 	reusePort bool
 
+	// registeredReusePort is set if the current endpoint registration was
+	// done with SO_REUSEPORT enabled.
+	registeredReusePort bool
+
 	// bindToDevice is set to the NIC on which to bind or disabled if 0.
 	bindToDevice tcpip.NICID
 
@@ -986,8 +1025,9 @@ func (e *endpoint) closeNoShutdownLocked() {
 	// in Listen() when trying to register.
 	if e.EndpointState() == StateListen && e.isPortReserved {
 		if e.isRegistered {
-			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
+			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, ports.Flags{LoadBalanced: e.registeredReusePort}, e.boundBindToDevice)
 			e.isRegistered = false
+			e.registeredReusePort = false
 		}
 
 		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
@@ -1051,8 +1091,9 @@ func (e *endpoint) cleanupLocked() {
 	e.workerCleanup = false
 
 	if e.isRegistered {
-		e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
+		e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, ports.Flags{LoadBalanced: e.registeredReusePort}, e.boundBindToDevice)
 		e.isRegistered = false
+		e.registeredReusePort = false
 	}
 
 	if e.isPortReserved {
@@ -1097,7 +1138,7 @@ func (e *endpoint) initialReceiveWindow() int {
 }
 
 // ModerateRecvBuf adjusts the receive buffer and the advertised window
-// based on the number of bytes copied to user space.
+// based on the number of bytes copied to userspace.
 func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.LockUser()
 	defer e.UnlockUser()
@@ -1172,11 +1213,6 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (stack.IPTables, error) {
-	return e.stack.IPTables(), nil
-}
-
 // Read reads data from the endpoint.
 func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
 	e.LockUser()
@@ -2058,10 +2094,11 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 
 	if e.ID.LocalPort != 0 {
 		// The endpoint is bound to a port, attempt to register it.
-		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.boundBindToDevice)
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, ports.Flags{LoadBalanced: e.reusePort}, e.boundBindToDevice)
 		if err != nil {
 			return err
 		}
+		e.registeredReusePort = e.reusePort
 	} else {
 		// The endpoint doesn't have a local port yet, so try to get
 		// one. Make sure that it isn't one that will result in the same
@@ -2093,12 +2130,13 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 
 			id := e.ID
 			id.LocalPort = p
-			switch e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) {
+			switch e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, ports.Flags{LoadBalanced: e.reusePort}, e.bindToDevice) {
 			case nil:
 				// Port picking successful. Save the details of
 				// the selected port.
 				e.ID = id
 				e.boundBindToDevice = e.bindToDevice
+				e.registeredReusePort = e.reusePort
 				return true, nil
 			case tcpip.ErrPortInUse:
 				return false, nil
@@ -2296,12 +2334,13 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
 	}
 
 	// Register the endpoint.
-	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.reusePort, e.boundBindToDevice); err != nil {
+	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, ports.Flags{LoadBalanced: e.reusePort}, e.boundBindToDevice); err != nil {
 		return err
 	}
 
 	e.isRegistered = true
 	e.setEndpointState(StateListen)
+	e.registeredReusePort = e.reusePort
 
 	// The channel may be non-nil when we're restoring the endpoint, and it
 	// may be pre-populated with some previously accepted (but not Accepted)
@@ -2462,7 +2501,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	}, nil
 }
 
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	// TCP HandlePacket is not required anymore as inbound packets first
 	// land at the Dispatcher which then can either delivery using the
 	// worker go routine or directly do the invoke the tcp processing inline
@@ -2481,7 +2520,7 @@ func (e *endpoint) enqueueSegment(s *segment) bool {
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 	switch typ {
 	case stack.ControlPacketTooBig:
 		e.sndBufMu.Lock()
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 8b7562396..cbb779666 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -49,11 +49,10 @@ func (e *endpoint) beforeSave() {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	switch e.EndpointState() {
-	case StateInitial, StateBound:
-		// TODO(b/138137272): this enumeration duplicates
-		// EndpointState.connected. remove it.
-	case StateEstablished, StateSynSent, StateSynRecv, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
+	epState := e.EndpointState()
+	switch {
+	case epState == StateInitial || epState == StateBound:
+	case epState.connected() || epState.handshake():
 		if e.route.Capabilities()&stack.CapabilitySaveRestore == 0 {
 			if e.route.Capabilities()&stack.CapabilityDisconnectOk == 0 {
 				panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%d, remote %v:%d", e.ID.LocalAddress, e.ID.LocalPort, e.ID.RemoteAddress, e.ID.RemotePort)})
@@ -69,15 +68,16 @@ func (e *endpoint) beforeSave() {
 			break
 		}
 		fallthrough
-	case StateListen, StateConnecting:
+	case epState == StateListen || epState == StateConnecting:
 		e.drainSegmentLocked()
-		if e.EndpointState() != StateClose && e.EndpointState() != StateError {
+		// Refresh epState, since drainSegmentLocked may have changed it.
+		epState = e.EndpointState()
+		if !epState.closed() {
 			if !e.workerRunning {
 				panic("endpoint has no worker running in listen, connecting, or connected state")
 			}
-			break
 		}
-	case StateError, StateClose:
+	case epState.closed():
 		for e.workerRunning {
 			e.mu.Unlock()
 			time.Sleep(100 * time.Millisecond)
@@ -148,23 +148,23 @@ var connectingLoading sync.WaitGroup
 // Bound endpoint loading happens last.
 
 // loadState is invoked by stateify.
-func (e *endpoint) loadState(state EndpointState) {
+func (e *endpoint) loadState(epState EndpointState) {
 	// This is to ensure that the loading wait groups include all applicable
 	// endpoints before any asynchronous calls to the Wait() methods.
 	// For restore purposes we treat TimeWait like a connected endpoint.
-	if state.connected() || state == StateTimeWait {
+	if epState.connected() || epState == StateTimeWait {
 		connectedLoading.Add(1)
 	}
-	switch state {
-	case StateListen:
+	switch {
+	case epState == StateListen:
 		listenLoading.Add(1)
-	case StateConnecting, StateSynSent, StateSynRecv:
+	case epState.connecting():
 		connectingLoading.Add(1)
 	}
 	// Directly update the state here rather than using e.setEndpointState
 	// as the endpoint is still being loaded and the stack reference is not
 	// yet initialized.
-	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
+	atomic.StoreUint32((*uint32)(&e.state), uint32(epState))
 }
 
 // afterLoad is invoked by stateify.
@@ -183,8 +183,8 @@ func (e *endpoint) afterLoad() {
 func (e *endpoint) Resume(s *stack.Stack) {
 	e.stack = s
 	e.segmentQueue.setLimit(MaxUnprocessedSegments)
-	state := e.origEndpointState
-	switch state {
+	epState := e.origEndpointState
+	switch epState {
 	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
 		var ss SendBufferSizeOption
 		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
@@ -208,8 +208,8 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		}
 	}
 
-	switch state {
-	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
+	switch {
+	case epState.connected():
 		bind()
 		if len(e.connectingAddress) == 0 {
 			e.connectingAddress = e.ID.RemoteAddress
@@ -232,13 +232,13 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		closed := e.closed
 		e.mu.Unlock()
 		e.notifyProtocolGoroutine(notifyTickleWorker)
-		if state == StateFinWait2 && closed {
+		if epState == StateFinWait2 && closed {
 			// If the endpoint has been closed then make sure we notify so
 			// that the FIN_WAIT2 timer is started after a restore.
 			e.notifyProtocolGoroutine(notifyClose)
 		}
 		connectedLoading.Done()
-	case StateListen:
+	case epState == StateListen:
 		tcpip.AsyncLoading.Add(1)
 		go func() {
 			connectedLoading.Wait()
@@ -255,7 +255,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 			listenLoading.Done()
 			tcpip.AsyncLoading.Done()
 		}()
-	case StateConnecting, StateSynSent, StateSynRecv:
+	case epState.connecting():
 		tcpip.AsyncLoading.Add(1)
 		go func() {
 			connectedLoading.Wait()
@@ -267,7 +267,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 			connectingLoading.Done()
 			tcpip.AsyncLoading.Done()
 		}()
-	case StateBound:
+	case epState == StateBound:
 		tcpip.AsyncLoading.Add(1)
 		go func() {
 			connectedLoading.Wait()
@@ -276,7 +276,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
 			bind()
 			tcpip.AsyncLoading.Done()
 		}()
-	case StateClose:
+	case epState == StateClose:
 		if e.isPortReserved {
 			tcpip.AsyncLoading.Add(1)
 			go func() {
@@ -291,12 +291,11 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		e.state = StateClose
 		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
-	case StateError:
+	case epState == StateError:
 		e.state = StateError
 		e.stack.CompleteTransportEndpointCleanup(e)
 		tcpip.DeleteDanglingEndpoint(e)
 	}
-
 }
 
 // saveLastError is invoked by stateify.
@@ -314,7 +313,7 @@ func (e *endpoint) loadLastError(s string) {
 		return
 	}
 
-	e.lastError = loadError(s)
+	e.lastError = tcpip.StringToError(s)
 }
 
 // saveHardError is invoked by stateify.
@@ -332,71 +331,7 @@ func (e *EndpointInfo) loadHardError(s string) {
 		return
 	}
 
-	e.HardError = loadError(s)
-}
-
-var messageToError map[string]*tcpip.Error
-
-var populate sync.Once
-
-func loadError(s string) *tcpip.Error {
-	populate.Do(func() {
-		var errors = []*tcpip.Error{
-			tcpip.ErrUnknownProtocol,
-			tcpip.ErrUnknownNICID,
-			tcpip.ErrUnknownDevice,
-			tcpip.ErrUnknownProtocolOption,
-			tcpip.ErrDuplicateNICID,
-			tcpip.ErrDuplicateAddress,
-			tcpip.ErrNoRoute,
-			tcpip.ErrBadLinkEndpoint,
-			tcpip.ErrAlreadyBound,
-			tcpip.ErrInvalidEndpointState,
-			tcpip.ErrAlreadyConnecting,
-			tcpip.ErrAlreadyConnected,
-			tcpip.ErrNoPortAvailable,
-			tcpip.ErrPortInUse,
-			tcpip.ErrBadLocalAddress,
-			tcpip.ErrClosedForSend,
-			tcpip.ErrClosedForReceive,
-			tcpip.ErrWouldBlock,
-			tcpip.ErrConnectionRefused,
-			tcpip.ErrTimeout,
-			tcpip.ErrAborted,
-			tcpip.ErrConnectStarted,
-			tcpip.ErrDestinationRequired,
-			tcpip.ErrNotSupported,
-			tcpip.ErrQueueSizeNotSupported,
-			tcpip.ErrNotConnected,
-			tcpip.ErrConnectionReset,
-			tcpip.ErrConnectionAborted,
-			tcpip.ErrNoSuchFile,
-			tcpip.ErrInvalidOptionValue,
-			tcpip.ErrNoLinkAddress,
-			tcpip.ErrBadAddress,
-			tcpip.ErrNetworkUnreachable,
-			tcpip.ErrMessageTooLong,
-			tcpip.ErrNoBufferSpace,
-			tcpip.ErrBroadcastDisabled,
-			tcpip.ErrNotPermitted,
-			tcpip.ErrAddressFamilyNotSupported,
-		}
-
-		messageToError = make(map[string]*tcpip.Error)
-		for _, e := range errors {
-			if messageToError[e.String()] != nil {
-				panic("tcpip errors with duplicated message: " + e.String())
-			}
-			messageToError[e.String()] = e
-		}
-	})
-
-	e, ok := messageToError[s]
-	if !ok {
-		panic("unknown error message: " + s)
-	}
-
-	return e
+	e.HardError = tcpip.StringToError(s)
 }
 
 // saveMeasureTime is invoked by stateify.
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 704d01c64..070b634b4 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -61,7 +61,7 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
 	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 2a2a7ddeb..73b8a6782 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -21,6 +21,7 @@
 package tcp
 
 import (
+	"fmt"
 	"runtime"
 	"strings"
 	"time"
@@ -206,7 +207,7 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 // to a specific processing queue. Each queue is serviced by its own processor
 // goroutine which is responsible for dequeuing and doing full TCP dispatch of
 // the packet.
-func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
+func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	p.dispatcher.queuePacket(r, ep, id, pkt)
 }
 
@@ -217,7 +218,7 @@ func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id st
 // a reset is sent in response to any incoming segment except another reset. In
 // particular, SYNs addressed to a non-existent connection are rejected by this
 // means."
-func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
+func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
 	s := newSegment(r, id, pkt)
 	defer s.decRef()
 
@@ -490,6 +491,26 @@ func (p *protocol) SynRcvdCounter() *synRcvdCounter {
 	return &p.synRcvdCount
 }
 
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
+	hdr, ok := pkt.Data.PullUp(header.TCPMinimumSize)
+	if !ok {
+		return false
+	}
+
+	// If the header has options, pull those up as well.
+	if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() {
+		hdr, ok = pkt.Data.PullUp(offset)
+		if !ok {
+			panic(fmt.Sprintf("There should be at least %d bytes in pkt.Data.", offset))
+		}
+	}
+
+	pkt.TransportHeader = hdr
+	pkt.Data.TrimFront(len(hdr))
+	return true
+}
+
 // NewProtocol returns a TCP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	return &protocol{
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 074edded6..0280892a8 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -35,6 +35,7 @@ type segment struct {
 	id     stack.TransportEndpointID `state:"manual"`
 	route  stack.Route               `state:"manual"`
 	data   buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
+	hdr    header.TCP
 	// views is used as buffer for data when its length is large
 	// enough to store a VectorisedView.
 	views [8]buffer.View `state:"nosave"`
@@ -60,13 +61,14 @@ type segment struct {
 	xmitCount uint32
 }
 
-func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) *segment {
+func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) *segment {
 	s := &segment{
 		refCnt: 1,
 		id:     id,
 		route:  r.Clone(),
 	}
 	s.data = pkt.Data.Clone(s.views[:])
+	s.hdr = header.TCP(pkt.TransportHeader)
 	s.rcvdTime = time.Now()
 	return s
 }
@@ -146,12 +148,6 @@ func (s *segment) logicalLen() seqnum.Size {
 // TCP checksum and stores the checksum and result of checksum verification in
 // the csum and csumValid fields of the segment.
 func (s *segment) parse() bool {
-	h, ok := s.data.PullUp(header.TCPMinimumSize)
-	if !ok {
-		return false
-	}
-	hdr := header.TCP(h)
-
 	// h is the header followed by the payload. We check that the offset to
 	// the data respects the following constraints:
 	// 1. That it's at least the minimum header size; if we don't do this
@@ -162,16 +158,12 @@ func (s *segment) parse() bool {
 	// N.B. The segment has already been validated as having at least the
 	//      minimum TCP size before reaching here, so it's safe to read the
 	//      fields.
-	offset := int(hdr.DataOffset())
-	if offset < header.TCPMinimumSize {
-		return false
-	}
-	hdrWithOpts, ok := s.data.PullUp(offset)
-	if !ok {
+	offset := int(s.hdr.DataOffset())
+	if offset < header.TCPMinimumSize || offset > len(s.hdr) {
 		return false
 	}
 
-	s.options = []byte(hdrWithOpts[header.TCPMinimumSize:])
+	s.options = []byte(s.hdr[header.TCPMinimumSize:])
 	s.parsedOptions = header.ParseTCPOptions(s.options)
 
 	// Query the link capabilities to decide if checksum validation is
@@ -180,22 +172,19 @@ func (s *segment) parse() bool {
 	if s.route.Capabilities()&stack.CapabilityRXChecksumOffload != 0 {
 		s.csumValid = true
 		verifyChecksum = false
-		s.data.TrimFront(offset)
 	}
 	if verifyChecksum {
-		hdr = header.TCP(hdrWithOpts)
-		s.csum = hdr.Checksum()
-		xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size()))
-		xsum = hdr.CalculateChecksum(xsum)
-		s.data.TrimFront(offset)
+		s.csum = s.hdr.Checksum()
+		xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size()+len(s.hdr)))
+		xsum = s.hdr.CalculateChecksum(xsum)
 		xsum = header.ChecksumVV(s.data, xsum)
 		s.csumValid = xsum == 0xffff
 	}
 
-	s.sequenceNumber = seqnum.Value(hdr.SequenceNumber())
-	s.ackNumber = seqnum.Value(hdr.AckNumber())
-	s.flags = hdr.Flags()
-	s.window = seqnum.Size(hdr.WindowSize())
+	s.sequenceNumber = seqnum.Value(s.hdr.SequenceNumber())
+	s.ackNumber = seqnum.Value(s.hdr.AckNumber())
+	s.flags = s.hdr.Flags()
+	s.window = seqnum.Size(s.hdr.WindowSize())
 	return true
 }
 
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 06dc9b7d7..acacb42e4 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -618,6 +618,20 @@ func (s *sender) splitSeg(seg *segment, size int) {
 	nSeg.data.TrimFront(size)
 	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
 	s.writeList.InsertAfter(seg, nSeg)
+
+	// The segment being split does not carry PUSH flag because it is
+	// followed by the newly split segment.
+	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
+	// segment (i.e., when there is no more queued data to be sent).
+	// Linux removes PSH flag only when the segment is being split over MSS
+	// and retains it when we are splitting the segment over lack of sender
+	// window space.
+	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
+	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
+	if seg.data.Size() > s.maxPayloadSize {
+		seg.flags ^= header.TCPFlagPsh
+	}
+
 	seg.data.CapLength(size)
 }
 
@@ -739,7 +753,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 	if !s.isAssignedSequenceNumber(seg) {
 		// Merge segments if allowed.
 		if seg.data.Size() != 0 {
-			available := int(seg.sequenceNumber.Size(end))
+			available := int(s.sndNxt.Size(end))
 			if available > limit {
 				available = limit
 			}
@@ -782,8 +796,11 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 					//   sent all at once.
 					return false
 				}
-				if atomic.LoadUint32(&s.ep.cork) != 0 {
-					// Hold back the segment until full.
+				// With TCP_CORK, hold back until minimum of the available
+				// send space and MSS.
+				// TODO(gvisor.dev/issue/2833): Drain the held segments after a
+				// timeout.
+				if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 {
 					return false
 				}
 			}
@@ -816,6 +833,25 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 			panic("Netstack queues FIN segments without data.")
 		}
 
+		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
+		// If the entire segment cannot be accomodated in the receiver
+		// advertized window, skip splitting and sending of the segment.
+		// ref: net/ipv4/tcp_output.c::tcp_snd_wnd_test()
+		//
+		// Linux checks this for all segment transmits not triggered
+		// by a probe timer. On this condition, it defers the segment
+		// split and transmit to a short probe timer.
+		// ref: include/net/tcp.h::tcp_check_probe_timer()
+		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
+		//
+		// Instead of defining a new transmit timer, we attempt to split the
+		// segment right here if there are no pending segments.
+		// If there are pending segments, segment transmits are deferred
+		// to the retransmit timer handler.
+		if s.sndUna != s.sndNxt && !segEnd.LessThan(end) {
+			return false
+		}
+
 		if !seg.sequenceNumber.LessThan(end) {
 			return false
 		}
@@ -824,9 +860,17 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 		if available == 0 {
 			return false
 		}
+
+		// The segment size limit is computed as a function of sender congestion
+		// window and MSS. When sender congestion window is > 1, this limit can
+		// be larger than MSS. Ensure that the currently available send space
+		// is not greater than minimum of this limit and MSS.
 		if available > limit {
 			available = limit
 		}
+		if available > s.maxPayloadSize {
+			available = s.maxPayloadSize
+		}
 
 		if seg.data.Size() > available {
 			s.splitSeg(seg, available)
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 0b4512c65..6ef32a1b3 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -5869,7 +5869,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
 		// Invoke the moderation API. This is required for auto-tuning
 		// to happen. This method is normally expected to be invoked
 		// from a higher layer than tcpip.Endpoint. So we simulate
-		// copying to user-space by invoking it explicitly here.
+		// copying to userspace by invoking it explicitly here.
 		c.EP.ModerateRecvBuf(totalCopied)
 
 		// Now send a keep-alive packet to trigger an ACK so that we can
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 7b1d72cf4..9721f6caf 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -316,7 +316,7 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt
 	copy(icmp[header.ICMPv4PayloadOffset:], p2)
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
@@ -372,7 +372,7 @@ func (c *Context) BuildSegmentWithAddrs(payload []byte, h *Headers, src, dst tcp
 // SendSegment sends a TCP segment that has already been built and written to a
 // buffer.VectorisedView.
 func (c *Context) SendSegment(s buffer.VectorisedView) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data: s,
 	})
 }
@@ -380,7 +380,7 @@ func (c *Context) SendSegment(s buffer.VectorisedView) {
 // SendPacket builds and sends a TCP segment(with the provided payload & TCP
 // headers) in an IPv4 packet via the link layer endpoint.
 func (c *Context) SendPacket(payload []byte, h *Headers) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data: c.BuildSegment(payload, h),
 	})
 }
@@ -389,7 +389,7 @@ func (c *Context) SendPacket(payload []byte, h *Headers) {
 // & TCPheaders) in an IPv4 packet via the link layer endpoint using the
 // provided source and destination IPv4 addresses.
 func (c *Context) SendPacketWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) {
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
 		Data: c.BuildSegmentWithAddrs(payload, h, src, dst),
 	})
 }
@@ -564,7 +564,7 @@ func (c *Context) SendV6PacketWithAddrs(payload []byte, h *Headers, src, dst tcp
 	t.SetChecksum(^t.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
 		Data: buf.ToVectorisedView(),
 	})
 }
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 756ab913a..df5efbf6a 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -15,6 +15,7 @@
 package udp
 
 import (
+	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -102,10 +103,13 @@ type endpoint struct {
 	multicastAddr  tcpip.Address
 	multicastNICID tcpip.NICID
 	multicastLoop  bool
-	reusePort      bool
+	portFlags      ports.Flags
 	bindToDevice   tcpip.NICID
 	broadcast      bool
 
+	lastErrorMu sync.Mutex   `state:"nosave"`
+	lastError   *tcpip.Error `state:".(string)"`
+
 	// Values used to reserve a port or register a transport endpoint.
 	// (which ever happens first).
 	boundBindToDevice tcpip.NICID
@@ -188,6 +192,15 @@ func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
 }
 
+func (e *endpoint) takeLastError() *tcpip.Error {
+	e.lastErrorMu.Lock()
+	defer e.lastErrorMu.Unlock()
+
+	err := e.lastError
+	e.lastError = nil
+	return err
+}
+
 // Abort implements stack.TransportEndpoint.Abort.
 func (e *endpoint) Abort() {
 	e.Close()
@@ -201,7 +214,7 @@ func (e *endpoint) Close() {
 
 	switch e.state {
 	case StateBound, StateConnected:
-		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
+		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
 		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
 		e.boundBindToDevice = 0
 		e.boundPortFlags = ports.Flags{}
@@ -235,14 +248,13 @@ func (e *endpoint) Close() {
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
-// IPTables implements tcpip.Endpoint.IPTables.
-func (e *endpoint) IPTables() (stack.IPTables, error) {
-	return e.stack.IPTables(), nil
-}
-
 // Read reads data from the endpoint. This method does not block if
 // there is no data pending.
 func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+	if err := e.takeLastError(); err != nil {
+		return buffer.View{}, tcpip.ControlMessages{}, err
+	}
+
 	e.rcvMu.Lock()
 
 	if e.rcvList.Empty() {
@@ -382,6 +394,10 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 }
 
 func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	if err := e.takeLastError(); err != nil {
+		return 0, nil, err
+	}
+
 	// MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.)
 	if opts.More {
 		return 0, nil, tcpip.ErrInvalidOptionValue
@@ -410,24 +426,33 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	}
 
 	var route *stack.Route
+	var resolve func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error)
 	var dstPort uint16
 	if to == nil {
 		route = &e.route
 		dstPort = e.dstPort
-
-		if route.IsResolutionRequired() {
-			// Promote lock to exclusive if using a shared route, given that it may need to
-			// change in Route.Resolve() call below.
+		resolve = func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) {
+			// Promote lock to exclusive if using a shared route, given that it may
+			// need to change in Route.Resolve() call below.
 			e.mu.RUnlock()
-			defer e.mu.RLock()
-
 			e.mu.Lock()
-			defer e.mu.Unlock()
 
 			// Recheck state after lock was re-acquired.
 			if e.state != StateConnected {
-				return 0, nil, tcpip.ErrInvalidEndpointState
+				err = tcpip.ErrInvalidEndpointState
+			}
+			if err == nil && route.IsResolutionRequired() {
+				ch, err = route.Resolve(waker)
 			}
+
+			e.mu.Unlock()
+			e.mu.RLock()
+
+			// Recheck state after lock was re-acquired.
+			if e.state != StateConnected {
+				err = tcpip.ErrInvalidEndpointState
+			}
+			return
 		}
 	} else {
 		// Reject destination address if it goes through a different
@@ -458,10 +483,11 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 		route = &r
 		dstPort = dst.Port
+		resolve = route.Resolve
 	}
 
 	if route.IsResolutionRequired() {
-		if ch, err := route.Resolve(nil); err != nil {
+		if ch, err := resolve(nil); err != nil {
 			if err == tcpip.ErrWouldBlock {
 				return 0, ch, tcpip.ErrNoLinkAddress
 			}
@@ -532,10 +558,13 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 		e.mu.Unlock()
 
 	case tcpip.ReuseAddressOption:
+		e.mu.Lock()
+		e.portFlags.MostRecent = v
+		e.mu.Unlock()
 
 	case tcpip.ReusePortOption:
 		e.mu.Lock()
-		e.reusePort = v
+		e.portFlags.LoadBalanced = v
 		e.mu.Unlock()
 
 	case tcpip.V6OnlyOption:
@@ -769,11 +798,15 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
 		return v, nil
 
 	case tcpip.ReuseAddressOption:
-		return false, nil
+		e.mu.RLock()
+		v := e.portFlags.MostRecent
+		e.mu.RUnlock()
+
+		return v, nil
 
 	case tcpip.ReusePortOption:
 		e.mu.RLock()
-		v := e.reusePort
+		v := e.portFlags.LoadBalanced
 		e.mu.RUnlock()
 
 		return v, nil
@@ -853,6 +886,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
 	switch o := opt.(type) {
 	case tcpip.ErrorOption:
+		return e.takeLastError()
 	case *tcpip.MulticastInterfaceOption:
 		e.mu.Lock()
 		*o = tcpip.MulticastInterfaceOption{
@@ -900,7 +934,11 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 	if useDefaultTTL {
 		ttl = r.DefaultTTL()
 	}
-	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, stack.PacketBuffer{
+	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: ProtocolNumber,
+		TTL:      ttl,
+		TOS:      tos,
+	}, &stack.PacketBuffer{
 		Header:          hdr,
 		Data:            data,
 		TransportHeader: buffer.View(udp),
@@ -937,6 +975,11 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 		id  stack.TransportEndpointID
 		btd tcpip.NICID
 	)
+
+	// We change this value below and we need the old value to unregister
+	// the endpoint.
+	boundPortFlags := e.boundPortFlags
+
 	// Exclude ephemerally bound endpoints.
 	if e.BindNICID != 0 || e.ID.LocalAddress == "" {
 		var err *tcpip.Error
@@ -949,16 +992,17 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 			return err
 		}
 		e.state = StateBound
+		boundPortFlags = e.boundPortFlags
 	} else {
 		if e.ID.LocalPort != 0 {
 			// Release the ephemeral port.
-			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
+			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, boundPortFlags, e.boundBindToDevice)
 			e.boundPortFlags = ports.Flags{}
 		}
 		e.state = StateInitial
 	}
 
-	e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
+	e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice)
 	e.ID = id
 	e.boundBindToDevice = btd
 	e.route.Release()
@@ -1030,6 +1074,8 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		}
 	}
 
+	oldPortFlags := e.boundPortFlags
+
 	id, btd, err := e.registerWithStack(nicID, netProtos, id)
 	if err != nil {
 		return err
@@ -1037,7 +1083,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 
 	// Remove the old registration.
 	if e.ID.LocalPort != 0 {
-		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
+		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, oldPortFlags, e.boundBindToDevice)
 	}
 
 	e.ID = id
@@ -1101,20 +1147,15 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
 
 func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) {
 	if e.ID.LocalPort == 0 {
-		flags := ports.Flags{
-			LoadBalanced: e.reusePort,
-			// FIXME(b/129164367): Support SO_REUSEADDR.
-			MostRecent: false,
-		}
-		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, flags, e.bindToDevice)
+		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, e.bindToDevice)
 		if err != nil {
 			return id, e.bindToDevice, err
 		}
-		e.boundPortFlags = flags
 		id.LocalPort = port
 	}
+	e.boundPortFlags = e.portFlags
 
-	err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice)
+	err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.boundPortFlags, e.bindToDevice)
 	if err != nil {
 		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, e.bindToDevice)
 		e.boundPortFlags = ports.Flags{}
@@ -1248,18 +1289,16 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) {
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
 	// Get the header then trim it from the view.
-	hdr, ok := pkt.Data.PullUp(header.UDPMinimumSize)
-	if !ok || int(header.UDP(hdr).Length()) > pkt.Data.Size() {
+	hdr := header.UDP(pkt.TransportHeader)
+	if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize {
 		// Malformed packet.
 		e.stack.Stats().UDP.MalformedPacketsReceived.Increment()
 		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
 		return
 	}
 
-	pkt.Data.TrimFront(header.UDPMinimumSize)
-
 	e.rcvMu.Lock()
 	e.stack.Stats().UDP.PacketsReceived.Increment()
 	e.stats.PacketsReceived.Increment()
@@ -1315,7 +1354,18 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
 }
 
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) {
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+	if typ == stack.ControlPortUnreachable {
+		e.mu.RLock()
+		defer e.mu.RUnlock()
+
+		if e.state == StateConnected {
+			e.lastErrorMu.Lock()
+			defer e.lastErrorMu.Unlock()
+
+			e.lastError = tcpip.ErrConnectionRefused
+		}
+	}
 }
 
 // State implements tcpip.Endpoint.State.
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index 466bd9381..851e6b635 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -37,6 +37,24 @@ func (u *udpPacket) loadData(data buffer.VectorisedView) {
 	u.data = data
 }
 
+// saveLastError is invoked by stateify.
+func (e *endpoint) saveLastError() string {
+	if e.lastError == nil {
+		return ""
+	}
+
+	return e.lastError.String()
+}
+
+// loadLastError is invoked by stateify.
+func (e *endpoint) loadLastError(s string) {
+	if s == "" {
+		return
+	}
+
+	e.lastError = tcpip.StringToError(s)
+}
+
 // beforeSave is invoked by stateify.
 func (e *endpoint) beforeSave() {
 	// Stop incoming packets from being handled (and mutate endpoint state).
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
index a674ceb68..c67e0ba95 100644
--- a/pkg/tcpip/transport/udp/forwarder.go
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -43,7 +43,7 @@ func NewForwarder(s *stack.Stack, handler func(*ForwarderRequest)) *Forwarder {
 //
 // This function is expected to be passed as an argument to the
 // stack.SetTransportProtocolHandler function.
-func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
 	f.handler(&ForwarderRequest{
 		stack: f.stack,
 		route: r,
@@ -61,7 +61,7 @@ type ForwarderRequest struct {
 	stack *stack.Stack
 	route *stack.Route
 	id    stack.TransportEndpointID
-	pkt   stack.PacketBuffer
+	pkt   *stack.PacketBuffer
 }
 
 // ID returns the 4-tuple (src address, src port, dst address, dst port) that
@@ -73,7 +73,7 @@ func (r *ForwarderRequest) ID() stack.TransportEndpointID {
 // CreateEndpoint creates a connected UDP endpoint for the session request.
 func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
 	ep := newEndpoint(r.stack, r.route.NetProto, queue)
-	if err := r.stack.RegisterTransportEndpoint(r.route.NICID(), []tcpip.NetworkProtocolNumber{r.route.NetProto}, ProtocolNumber, r.id, ep, ep.reusePort, ep.bindToDevice); err != nil {
+	if err := r.stack.RegisterTransportEndpoint(r.route.NICID(), []tcpip.NetworkProtocolNumber{r.route.NetProto}, ProtocolNumber, r.id, ep, ep.portFlags, ep.bindToDevice); err != nil {
 		ep.Close()
 		return nil, err
 	}
@@ -82,6 +82,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint,
 	ep.route = r.route.Clone()
 	ep.dstPort = r.id.RemotePort
 	ep.RegisterNICID = r.route.NICID()
+	ep.boundPortFlags = ep.portFlags
 
 	ep.state = StateConnected
 
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 52af6de22..4218e7d03 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -66,15 +66,9 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
 
 // HandleUnknownDestinationPacket handles packets targeted at this protocol but
 // that don't match any existing endpoint.
-func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool {
-	// Get the header then trim it from the view.
-	h, ok := pkt.Data.PullUp(header.UDPMinimumSize)
-	if !ok {
-		// Malformed packet.
-		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
-		return true
-	}
-	if int(header.UDP(h).Length()) > pkt.Data.Size() {
+func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
+	hdr := header.UDP(pkt.TransportHeader)
+	if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize {
 		// Malformed packet.
 		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
 		return true
@@ -121,7 +115,7 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		}
 		headerLen := int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize
 		available := int(mtu) - headerLen
-		payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size()
+		payloadLen := len(pkt.NetworkHeader) + len(pkt.TransportHeader) + pkt.Data.Size()
 		if payloadLen > available {
 			payloadLen = available
 		}
@@ -130,9 +124,10 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		// For example, a raw or packet socket may use what UDP
 		// considers an unreachable destination. Thus we deep copy pkt
 		// to prevent multiple ownership and SR errors.
-		newNetHeader := append(buffer.View(nil), pkt.NetworkHeader...)
-		payload := newNetHeader.ToVectorisedView()
-		payload.Append(pkt.Data.ToView().ToVectorisedView())
+		newHeader := append(buffer.View(nil), pkt.NetworkHeader...)
+		newHeader = append(newHeader, pkt.TransportHeader...)
+		payload := newHeader.ToVectorisedView()
+		payload.AppendView(pkt.Data.ToView())
 		payload.CapLength(payloadLen)
 
 		hdr := buffer.NewPrependable(headerLen)
@@ -140,9 +135,10 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv4DstUnreachable)
 		pkt.SetCode(header.ICMPv4PortUnreachable)
 		pkt.SetChecksum(header.ICMPv4Checksum(pkt, payload))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
-			Header: hdr,
-			Data:   payload,
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+			Header:          hdr,
+			TransportHeader: buffer.View(pkt),
+			Data:            payload,
 		})
 
 	case header.IPv6AddressSize:
@@ -164,11 +160,11 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		}
 		headerLen := int(r.MaxHeaderLength()) + header.ICMPv6DstUnreachableMinimumSize
 		available := int(mtu) - headerLen
-		payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size()
+		payloadLen := len(pkt.NetworkHeader) + len(pkt.TransportHeader) + pkt.Data.Size()
 		if payloadLen > available {
 			payloadLen = available
 		}
-		payload := buffer.NewVectorisedView(len(pkt.NetworkHeader), []buffer.View{pkt.NetworkHeader})
+		payload := buffer.NewVectorisedView(len(pkt.NetworkHeader)+len(pkt.TransportHeader), []buffer.View{pkt.NetworkHeader, pkt.TransportHeader})
 		payload.Append(pkt.Data)
 		payload.CapLength(payloadLen)
 
@@ -177,9 +173,10 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans
 		pkt.SetType(header.ICMPv6DstUnreachable)
 		pkt.SetCode(header.ICMPv6PortUnreachable)
 		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, payload))
-		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{
-			Header: hdr,
-			Data:   payload,
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+			Header:          hdr,
+			TransportHeader: buffer.View(pkt),
+			Data:            payload,
 		})
 	}
 	return true
@@ -201,6 +198,18 @@ func (*protocol) Close() {}
 // Wait implements stack.TransportProtocol.Wait.
 func (*protocol) Wait() {}
 
+// Parse implements stack.TransportProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
+	h, ok := pkt.Data.PullUp(header.UDPMinimumSize)
+	if !ok {
+		// Packet is too small
+		return false
+	}
+	pkt.TransportHeader = h
+	pkt.Data.TrimFront(header.UDPMinimumSize)
+	return true
+}
+
 // NewProtocol returns a UDP transport protocol.
 func NewProtocol() stack.TransportProtocol {
 	return &protocol{}
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 8acaa607a..313a3f117 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -440,10 +440,8 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool
 	u.SetChecksum(^u.CalculateChecksum(xsum))
 
 	// Inject packet.
-	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{
-		Data:            buf.ToVectorisedView(),
-		NetworkHeader:   buffer.View(ip),
-		TransportHeader: buffer.View(u),
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
 	})
 }
 
@@ -487,10 +485,8 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
 
 	// Inject packet.
 
-	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{
-		Data:            buf.ToVectorisedView(),
-		NetworkHeader:   buffer.View(ip),
-		TransportHeader: buffer.View(u),
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
 	})
 }
 
@@ -1720,6 +1716,58 @@ func TestIncrementMalformedPacketsReceived(t *testing.T) {
 	}
 }
 
+// TestShortHeader verifies that when a packet with a too-short UDP header is
+// received, the malformed received global stat gets incremented.
+func TestShortHeader(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to wildcard.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	c.t.Helper()
+	h := unicastV6.header4Tuple(incoming)
+
+	// Allocate a buffer for an IPv6 and too-short UDP header.
+	const udpSize = header.UDPMinimumSize - 1
+	buf := buffer.NewView(header.IPv6MinimumSize + udpSize)
+	// Initialize the IP header.
+	ip := header.IPv6(buf)
+	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
+		PayloadLength: uint16(udpSize),
+		NextHeader:    uint8(udp.ProtocolNumber),
+		HopLimit:      65,
+		SrcAddr:       h.srcAddr.Addr,
+		DstAddr:       h.dstAddr.Addr,
+	})
+
+	// Initialize the UDP header.
+	udpHdr := header.UDP(buffer.NewView(header.UDPMinimumSize))
+	udpHdr.Encode(&header.UDPFields{
+		SrcPort: h.srcAddr.Port,
+		DstPort: h.dstAddr.Port,
+		Length:  header.UDPMinimumSize,
+	})
+	// Calculate the UDP pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, h.srcAddr.Addr, h.dstAddr.Addr, uint16(len(udpHdr)))
+	udpHdr.SetChecksum(^udpHdr.CalculateChecksum(xsum))
+	// Copy all but the last byte of the UDP header into the packet.
+	copy(buf[header.IPv6MinimumSize:], udpHdr)
+
+	// Inject packet.
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	if got, want := c.s.Stats().MalformedRcvdPackets.Value(), uint64(1); got != want {
+		t.Errorf("got c.s.Stats().MalformedRcvdPackets.Value() = %d, want = %d", got, want)
+	}
+}
+
 // TestShutdownRead verifies endpoint read shutdown and error
 // stats increment on packet receive.
 func TestShutdownRead(t *testing.T) {
diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go
index 5f2af9f3b..c45d2ecbc 100644
--- a/pkg/test/dockerutil/dockerutil.go
+++ b/pkg/test/dockerutil/dockerutil.go
@@ -148,6 +148,62 @@ func (m MountMode) String() string {
 	panic(fmt.Sprintf("invalid mode: %d", m))
 }
 
+// DockerNetwork contains the name of a docker network.
+type DockerNetwork struct {
+	logger     testutil.Logger
+	Name       string
+	Subnet     *net.IPNet
+	containers []*Docker
+}
+
+// NewDockerNetwork sets up the struct for a Docker network. Names of networks
+// will be unique.
+func NewDockerNetwork(logger testutil.Logger) *DockerNetwork {
+	return &DockerNetwork{
+		logger: logger,
+		Name:   testutil.RandomID(logger.Name()),
+	}
+}
+
+// Create calls 'docker network create'.
+func (n *DockerNetwork) Create(args ...string) error {
+	a := []string{"docker", "network", "create"}
+	if n.Subnet != nil {
+		a = append(a, fmt.Sprintf("--subnet=%s", n.Subnet))
+	}
+	a = append(a, args...)
+	a = append(a, n.Name)
+	return testutil.Command(n.logger, a...).Run()
+}
+
+// Connect calls 'docker network connect' with the arguments provided.
+func (n *DockerNetwork) Connect(container *Docker, args ...string) error {
+	a := []string{"docker", "network", "connect"}
+	a = append(a, args...)
+	a = append(a, n.Name, container.Name)
+	if err := testutil.Command(n.logger, a...).Run(); err != nil {
+		return err
+	}
+	n.containers = append(n.containers, container)
+	return nil
+}
+
+// Cleanup cleans up the docker network and all the containers attached to it.
+func (n *DockerNetwork) Cleanup() error {
+	for _, c := range n.containers {
+		// Don't propagate the error, it might be that the container
+		// was already cleaned up.
+		if err := c.Kill(); err != nil {
+			n.logger.Logf("unable to kill container during cleanup: %s", err)
+		}
+	}
+
+	if err := testutil.Command(n.logger, "docker", "network", "rm", n.Name).Run(); err != nil {
+		return err
+	}
+	return nil
+}
+
 // Docker contains the name and the runtime of a docker container.
 type Docker struct {
 	logger   testutil.Logger
@@ -162,9 +218,13 @@ type Docker struct {
 //
 // Names of containers will be unique.
 func MakeDocker(logger testutil.Logger) *Docker {
+	// Slashes are not allowed in container names.
+	name := testutil.RandomID(logger.Name())
+	name = strings.ReplaceAll(name, "/", "-")
+
 	return &Docker{
 		logger:  logger,
-		Name:    testutil.RandomID(logger.Name()),
+		Name:    name,
 		Runtime: *runtime,
 	}
 }
@@ -309,7 +369,9 @@ func (d *Docker) argsFor(r *RunOpts, command string, p []string) (rv []string) {
 		rv = append(rv, d.Name)
 	} else {
 		rv = append(rv, d.mounts...)
-		rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime))
+		if len(d.Runtime) > 0 {
+			rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime))
+		}
 		rv = append(rv, fmt.Sprintf("--name=%s", d.Name))
 		rv = append(rv, testutil.ImageByName(r.Image))
 	}
@@ -477,6 +539,56 @@ func (d *Docker) FindIP() (net.IP, error) {
 	return ip, nil
 }
 
+// A NetworkInterface is container's network interface information.
+type NetworkInterface struct {
+	IPv4 net.IP
+	MAC  net.HardwareAddr
+}
+
+// ListNetworks returns the network interfaces of the container, keyed by
+// Docker network name.
+func (d *Docker) ListNetworks() (map[string]NetworkInterface, error) {
+	const format = `{{json .NetworkSettings.Networks}}`
+	out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("error network interfaces: %q: %w", string(out), err)
+	}
+
+	networks := map[string]map[string]string{}
+	if err := json.Unmarshal(out, &networks); err != nil {
+		return nil, fmt.Errorf("error decoding network interfaces: %w", err)
+	}
+
+	interfaces := map[string]NetworkInterface{}
+	for name, iface := range networks {
+		var netface NetworkInterface
+
+		rawIP := strings.TrimSpace(iface["IPAddress"])
+		if rawIP != "" {
+			ip := net.ParseIP(rawIP)
+			if ip == nil {
+				return nil, fmt.Errorf("invalid IP: %q", rawIP)
+			}
+			// Docker's IPAddress field is IPv4. The IPv6 address
+			// is stored in the GlobalIPv6Address field.
+			netface.IPv4 = ip
+		}
+
+		rawMAC := strings.TrimSpace(iface["MacAddress"])
+		if rawMAC != "" {
+			mac, err := net.ParseMAC(rawMAC)
+			if err != nil {
+				return nil, fmt.Errorf("invalid MAC: %q: %w", rawMAC, err)
+			}
+			netface.MAC = mac
+		}
+
+		interfaces[name] = netface
+	}
+
+	return interfaces, nil
+}
+
 // SandboxPid returns the PID to the sandbox process.
 func (d *Docker) SandboxPid() (int, error) {
 	out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.State.Pid}}", d.Name).CombinedOutput()
diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD
deleted file mode 100644
index 2dcba84ae..000000000
--- a/pkg/tmutex/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-load("//tools:defs.bzl", "go_library", "go_test")
-
-package(licenses = ["notice"])
-
-go_library(
-    name = "tmutex",
-    srcs = ["tmutex.go"],
-    visibility = ["//:sandbox"],
-)
-
-go_test(
-    name = "tmutex_test",
-    size = "medium",
-    srcs = ["tmutex_test.go"],
-    library = ":tmutex",
-    deps = ["//pkg/sync"],
-)
diff --git a/pkg/tmutex/tmutex.go b/pkg/tmutex/tmutex.go
deleted file mode 100644
index c4685020d..000000000
--- a/pkg/tmutex/tmutex.go
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package tmutex provides the implementation of a mutex that implements an
-// efficient TryLock function in addition to Lock and Unlock.
-package tmutex
-
-import (
-	"sync/atomic"
-)
-
-// Mutex is a mutual exclusion primitive that implements TryLock in addition
-// to Lock and Unlock.
-type Mutex struct {
-	v  int32
-	ch chan struct{}
-}
-
-// Init initializes the mutex.
-func (m *Mutex) Init() {
-	m.v = 1
-	m.ch = make(chan struct{}, 1)
-}
-
-// Lock acquires the mutex. If it is currently held by another goroutine, Lock
-// will wait until it has a chance to acquire it.
-func (m *Mutex) Lock() {
-	// Uncontended case.
-	if atomic.AddInt32(&m.v, -1) == 0 {
-		return
-	}
-
-	for {
-		// Try to acquire the mutex again, at the same time making sure
-		// that m.v is negative, which indicates to the owner of the
-		// lock that it is contended, which will force it to try to wake
-		// someone up when it releases the mutex.
-		if v := atomic.LoadInt32(&m.v); v >= 0 && atomic.SwapInt32(&m.v, -1) == 1 {
-			return
-		}
-
-		// Wait for the mutex to be released before trying again.
-		<-m.ch
-	}
-}
-
-// TryLock attempts to acquire the mutex without blocking. If the mutex is
-// currently held by another goroutine, it fails to acquire it and returns
-// false.
-func (m *Mutex) TryLock() bool {
-	v := atomic.LoadInt32(&m.v)
-	if v <= 0 {
-		return false
-	}
-	return atomic.CompareAndSwapInt32(&m.v, 1, 0)
-}
-
-// Unlock releases the mutex.
-func (m *Mutex) Unlock() {
-	if atomic.SwapInt32(&m.v, 1) == 0 {
-		// There were no pending waiters.
-		return
-	}
-
-	// Wake some waiter up.
-	select {
-	case m.ch <- struct{}{}:
-	default:
-	}
-}
diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go
deleted file mode 100644
index 05540696a..000000000
--- a/pkg/tmutex/tmutex_test.go
+++ /dev/null
@@ -1,258 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tmutex
-
-import (
-	"fmt"
-	"runtime"
-	"sync/atomic"
-	"testing"
-	"time"
-
-	"gvisor.dev/gvisor/pkg/sync"
-)
-
-func TestBasicLock(t *testing.T) {
-	var m Mutex
-	m.Init()
-
-	m.Lock()
-
-	// Try blocking lock the mutex from a different goroutine. This must
-	// not block because the mutex is held.
-	ch := make(chan struct{}, 1)
-	go func() {
-		m.Lock()
-		ch <- struct{}{}
-		m.Unlock()
-		ch <- struct{}{}
-	}()
-
-	select {
-	case <-ch:
-		t.Fatalf("Lock succeeded on locked mutex")
-	case <-time.After(100 * time.Millisecond):
-	}
-
-	// Unlock the mutex and make sure that the goroutine waiting on Lock()
-	// unblocks and succeeds.
-	m.Unlock()
-
-	select {
-	case <-ch:
-	case <-time.After(100 * time.Millisecond):
-		t.Fatalf("Lock failed to acquire unlocked mutex")
-	}
-
-	// Make sure we can lock and unlock again.
-	m.Lock()
-	m.Unlock()
-}
-
-func TestTryLock(t *testing.T) {
-	var m Mutex
-	m.Init()
-
-	// Try to lock. It should succeed.
-	if !m.TryLock() {
-		t.Fatalf("TryLock failed on unlocked mutex")
-	}
-
-	// Try to lock again, it should now fail.
-	if m.TryLock() {
-		t.Fatalf("TryLock succeeded on locked mutex")
-	}
-
-	// Try blocking lock the mutex from a different goroutine. This must
-	// not block because the mutex is held.
-	ch := make(chan struct{}, 1)
-	go func() {
-		m.Lock()
-		ch <- struct{}{}
-		m.Unlock()
-	}()
-
-	select {
-	case <-ch:
-		t.Fatalf("Lock succeeded on locked mutex")
-	case <-time.After(100 * time.Millisecond):
-	}
-
-	// Unlock the mutex and make sure that the goroutine waiting on Lock()
-	// unblocks and succeeds.
-	m.Unlock()
-
-	select {
-	case <-ch:
-	case <-time.After(100 * time.Millisecond):
-		t.Fatalf("Lock failed to acquire unlocked mutex")
-	}
-}
-
-func TestMutualExclusion(t *testing.T) {
-	var m Mutex
-	m.Init()
-
-	// Test mutual exclusion by running "gr" goroutines concurrently, and
-	// have each one increment a counter "iters" times within the critical
-	// section established by the mutex.
-	//
-	// If at the end the counter is not gr * iters, then we know that
-	// goroutines ran concurrently within the critical section.
-	//
-	// If one of the goroutines doesn't complete, it's likely a bug that
-	// causes to it to wait forever.
-	const gr = 1000
-	const iters = 100000
-	v := 0
-	var wg sync.WaitGroup
-	for i := 0; i < gr; i++ {
-		wg.Add(1)
-		go func() {
-			for j := 0; j < iters; j++ {
-				m.Lock()
-				v++
-				m.Unlock()
-			}
-			wg.Done()
-		}()
-	}
-
-	wg.Wait()
-
-	if v != gr*iters {
-		t.Fatalf("Bad count: got %v, want %v", v, gr*iters)
-	}
-}
-
-func TestMutualExclusionWithTryLock(t *testing.T) {
-	var m Mutex
-	m.Init()
-
-	// Similar to the previous, with the addition of some goroutines that
-	// only increment the count if TryLock succeeds.
-	const gr = 1000
-	const iters = 100000
-	total := int64(gr * iters)
-	var tryTotal int64
-	v := int64(0)
-	var wg sync.WaitGroup
-	for i := 0; i < gr; i++ {
-		wg.Add(2)
-		go func() {
-			for j := 0; j < iters; j++ {
-				m.Lock()
-				v++
-				m.Unlock()
-			}
-			wg.Done()
-		}()
-		go func() {
-			local := int64(0)
-			for j := 0; j < iters; j++ {
-				if m.TryLock() {
-					v++
-					m.Unlock()
-					local++
-				}
-			}
-			atomic.AddInt64(&tryTotal, local)
-			wg.Done()
-		}()
-	}
-
-	wg.Wait()
-
-	t.Logf("tryTotal = %d", tryTotal)
-	total += tryTotal
-
-	if v != total {
-		t.Fatalf("Bad count: got %v, want %v", v, total)
-	}
-}
-
-// BenchmarkTmutex is equivalent to TestMutualExclusion, with the following
-// differences:
-//
-// - The number of goroutines is variable, with the maximum value depending on
-// GOMAXPROCS.
-//
-// - The number of iterations per benchmark is controlled by the benchmarking
-// framework.
-//
-// - Care is taken to ensure that all goroutines participating in the benchmark
-// have been created before the benchmark begins.
-func BenchmarkTmutex(b *testing.B) {
-	for n, max := 1, 4*runtime.GOMAXPROCS(0); n > 0 && n <= max; n *= 2 {
-		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
-			var m Mutex
-			m.Init()
-
-			var ready sync.WaitGroup
-			begin := make(chan struct{})
-			var end sync.WaitGroup
-			for i := 0; i < n; i++ {
-				ready.Add(1)
-				end.Add(1)
-				go func() {
-					ready.Done()
-					<-begin
-					for j := 0; j < b.N; j++ {
-						m.Lock()
-						m.Unlock()
-					}
-					end.Done()
-				}()
-			}
-
-			ready.Wait()
-			b.ResetTimer()
-			close(begin)
-			end.Wait()
-		})
-	}
-}
-
-// BenchmarkSyncMutex is equivalent to BenchmarkTmutex, but uses sync.Mutex as
-// a comparison point.
-func BenchmarkSyncMutex(b *testing.B) {
-	for n, max := 1, 4*runtime.GOMAXPROCS(0); n > 0 && n <= max; n *= 2 {
-		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
-			var m sync.Mutex
-
-			var ready sync.WaitGroup
-			begin := make(chan struct{})
-			var end sync.WaitGroup
-			for i := 0; i < n; i++ {
-				ready.Add(1)
-				end.Add(1)
-				go func() {
-					ready.Done()
-					<-begin
-					for j := 0; j < b.N; j++ {
-						m.Lock()
-						m.Unlock()
-					}
-					end.Done()
-				}()
-			}
-
-			ready.Wait()
-			b.ResetTimer()
-			close(begin)
-			end.Wait()
-		})
-	}
-}
diff --git a/pkg/usermem/addr.go b/pkg/usermem/addr.go
index e79210804..c4100481e 100644
--- a/pkg/usermem/addr.go
+++ b/pkg/usermem/addr.go
@@ -106,3 +106,20 @@ func (ar AddrRange) IsPageAligned() bool {
 func (ar AddrRange) String() string {
 	return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End)
 }
+
+// PageRoundDown/Up are equivalent to Addr.RoundDown/Up, but without the
+// potentially truncating conversion from uint64 to Addr. This is necessary
+// because there is no way to define generic "PageRoundDown/Up" functions in Go.
+
+// PageRoundDown returns x rounded down to the nearest page boundary.
+func PageRoundDown(x uint64) uint64 {
+	return x &^ (PageSize - 1)
+}
+
+// PageRoundUp returns x rounded up to the nearest page boundary.
+// ok is true iff rounding up did not wrap around.
+func PageRoundUp(x uint64) (addr uint64, ok bool) {
+	addr = PageRoundDown(x + PageSize - 1)
+	ok = addr >= x
+	return
+}
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index b7cfb35bf..84c67cbc2 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -119,7 +119,13 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
 	}
 
 	if tr.shouldReport(regs) {
-		c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
+		name := c.nameMap.Name(uintptr(sysnr))
+		c.sink.Infof("Unsupported syscall %s(%#x,%#x,%#x,%#x,%#x,%#x). It is "+
+			"likely that you can safely ignore this message and that this is not "+
+			"the cause of any error. Please, refer to %s/%s for more information.",
+			name, argVal(0, regs), argVal(1, regs), argVal(2, regs), argVal(3, regs),
+			argVal(4, regs), argVal(5, regs), syscallLink, name)
+
 		tr.onReported(regs)
 	}
 }
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
index 42b0ca8b0..8eb76b2ba 100644
--- a/runsc/boot/compat_amd64.go
+++ b/runsc/boot/compat_amd64.go
@@ -24,8 +24,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/strace"
 )
 
-// reportLimit is the max number of events that should be reported per tracker.
-const reportLimit = 100
+const (
+	// reportLimit is the max number of events that should be reported per
+	// tracker.
+	reportLimit = 100
+	syscallLink = "https://gvisor.dev/c/linux/amd64"
+)
 
 // newRegs create a empty Registers instance.
 func newRegs() *rpb.Registers {
@@ -36,22 +40,22 @@ func newRegs() *rpb.Registers {
 	}
 }
 
-func argVal(argIdx int, regs *rpb.Registers) uint32 {
+func argVal(argIdx int, regs *rpb.Registers) uint64 {
 	amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
 
 	switch argIdx {
 	case 0:
-		return uint32(amd64Regs.Rdi)
+		return amd64Regs.Rdi
 	case 1:
-		return uint32(amd64Regs.Rsi)
+		return amd64Regs.Rsi
 	case 2:
-		return uint32(amd64Regs.Rdx)
+		return amd64Regs.Rdx
 	case 3:
-		return uint32(amd64Regs.R10)
+		return amd64Regs.R10
 	case 4:
-		return uint32(amd64Regs.R8)
+		return amd64Regs.R8
 	case 5:
-		return uint32(amd64Regs.R9)
+		return amd64Regs.R9
 	}
 	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
index f784cd237..bce9d95b3 100644
--- a/runsc/boot/compat_arm64.go
+++ b/runsc/boot/compat_arm64.go
@@ -23,8 +23,12 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/strace"
 )
 
-// reportLimit is the max number of events that should be reported per tracker.
-const reportLimit = 100
+const (
+	// reportLimit is the max number of events that should be reported per
+	// tracker.
+	reportLimit = 100
+	syscallLink = "https://gvisor.dev/c/linux/arm64"
+)
 
 // newRegs create a empty Registers instance.
 func newRegs() *rpb.Registers {
@@ -35,22 +39,22 @@ func newRegs() *rpb.Registers {
 	}
 }
 
-func argVal(argIdx int, regs *rpb.Registers) uint32 {
+func argVal(argIdx int, regs *rpb.Registers) uint64 {
 	arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
 
 	switch argIdx {
 	case 0:
-		return uint32(arm64Regs.R0)
+		return arm64Regs.R0
 	case 1:
-		return uint32(arm64Regs.R1)
+		return arm64Regs.R1
 	case 2:
-		return uint32(arm64Regs.R2)
+		return arm64Regs.R2
 	case 3:
-		return uint32(arm64Regs.R3)
+		return arm64Regs.R3
 	case 4:
-		return uint32(arm64Regs.R4)
+		return arm64Regs.R4
 	case 5:
-		return uint32(arm64Regs.R5)
+		return arm64Regs.R5
 	}
 	panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
 }
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 98cdd90dd..60e33425f 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -288,6 +288,14 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_SIGALTSTACK:     {},
 	unix.SYS_STATX:              {},
 	syscall.SYS_SYNC_FILE_RANGE: {},
+	syscall.SYS_TEE: []seccomp.Rule{
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+			seccomp.AllowValue(1),                      /* len */
+			seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+		},
+	},
 	syscall.SYS_TGKILL: []seccomp.Rule{
 		{
 			seccomp.AllowValue(uint64(os.Getpid())),
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
index 5e5a3c998..209e646a7 100644
--- a/runsc/boot/filter/extra_filters_msan.go
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -26,6 +26,8 @@ import (
 func instrumentationFilters() seccomp.SyscallRules {
 	Report("MSAN is enabled: syscall filters less restrictive!")
 	return seccomp.SyscallRules{
+		syscall.SYS_CLONE:             {},
+		syscall.SYS_MMAP:              {},
 		syscall.SYS_SCHED_GETAFFINITY: {},
 		syscall.SYS_SET_ROBUST_LIST:   {},
 	}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e1181271a..e83584b82 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
 	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
 	gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
@@ -62,7 +63,7 @@ const (
 )
 
 // tmpfs has some extra supported options that we must pass through.
-var tmpfsAllowedOptions = []string{"mode", "uid", "gid"}
+var tmpfsAllowedData = []string{"mode", "uid", "gid"}
 
 func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
@@ -153,8 +154,8 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 	return mounts
 }
 
-// p9MountOptions creates a slice of options for a p9 mount.
-func p9MountOptions(fd int, fa FileAccessType, vfs2 bool) []string {
+// p9MountData creates a slice of p9 mount data.
+func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
 	opts := []string{
 		"trans=fd",
 		"rfdno=" + strconv.Itoa(fd),
@@ -221,9 +222,6 @@ func mountFlags(opts []string) fs.MountSourceFlags {
 			mf.NoAtime = true
 		case "noexec":
 			mf.NoExec = true
-		case "bind", "rbind":
-			// When options include either "bind" or "rbind",
-			// it's converted to a 9P mount.
 		default:
 			log.Warningf("ignoring unknown mount option %q", o)
 		}
@@ -237,7 +235,7 @@ func isSupportedMountFlag(fstype, opt string) bool {
 		return true
 	}
 	if fstype == tmpfsvfs2.Name {
-		ok, err := parseMountOption(opt, tmpfsAllowedOptions...)
+		ok, err := parseMountOption(opt, tmpfsAllowedData...)
 		return ok && err == nil
 	}
 	return false
@@ -294,19 +292,12 @@ func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter,
 	// Set namespace here so that it can be found in ctx.
 	procArgs.MountNamespace = mns
 
-	return setExecutablePath(ctx, procArgs)
-}
-
-// setExecutablePath sets the procArgs.Filename by searching the PATH for an
-// executable matching the procArgs.Argv[0].
-func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
-	paths := fs.GetPath(procArgs.Envv)
-	exe := procArgs.Argv[0]
-	f, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
+	// Resolve the executable path from working dir and environment.
+	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
 	if err != nil {
-		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+		return err
 	}
-	procArgs.Filename = f
+	procArgs.Filename = resolved
 	return nil
 }
 
@@ -725,7 +716,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
 	fd := c.fds.remove()
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
 	p9FS := mustFindFilesystem("9p")
-	opts := p9MountOptions(fd, conf.FileAccess, false /* vfs2 */)
+	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
 
 	if conf.OverlayfsStaleRead {
 		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
@@ -770,10 +761,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		useOverlay bool
 	)
 
-	if isBindMount(m) {
-		m.Type = bind
-	}
-
 	switch m.Type {
 	case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name:
 		fsName = m.Type
@@ -783,7 +770,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		fsName = m.Type
 
 		var err error
-		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
 		if err != nil {
 			return "", nil, false, err
 		}
@@ -791,7 +778,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	case bind:
 		fd := c.fds.remove()
 		fsName = gofervfs2.Name
-		opts = p9MountOptions(fd, c.getMountAccessType(m), conf.VFS2)
+		opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2)
 		// If configured, add overlay to all writable mounts.
 		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
 
@@ -801,18 +788,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	return fsName, opts, useOverlay, nil
 }
 
-func isBindMount(m specs.Mount) bool {
-	for _, opt := range m.Options {
-		// When options include either "bind" or "rbind", this behaves as
-		// bind mount even if the mount type is equal to a filesystem supported
-		// on runsc.
-		if opt == "bind" || opt == "rbind" {
-			return true
-		}
-	}
-	return false
-}
-
 func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
 	if hint := c.hints.findMount(mount); hint != nil {
 		return hint.fileAccessType()
@@ -956,7 +931,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
 
 	// Add root mount.
 	fd := c.fds.remove()
-	opts := p9MountOptions(fd, conf.FileAccess, false /* vfs2 */)
+	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
 
 	mf := fs.MountSourceFlags{}
 	if c.root.Readonly || conf.Overlay {
@@ -1044,7 +1019,7 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
 			Destination: "/tmp",
 			// Sticky bit is added to prevent accidental deletion of files from
 			// another user. This is normally done for /tmp.
-			Options: []string{"mode=1777"},
+			Options: []string{"mode=01777"},
 		}
 		return c.mountSubmount(ctx, conf, mns, root, tmpMount)
 
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index f802bc9fb..002479612 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -1056,7 +1056,7 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
 		return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
 	}
 
-	s.FillDefaultIPTables()
+	s.FillIPTablesMetadata()
 
 	return &s, nil
 }
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 147c901c4..8eeb43e79 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -22,22 +22,21 @@ import (
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/user"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
@@ -95,69 +94,14 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte
 		return fmt.Errorf("failed to setupFS: %w", err)
 	}
 	procArgs.MountNamespaceVFS2 = mns
-	return setExecutablePathVFS2(ctx, procArgs)
-}
-
-func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
-	exe := procArgs.Argv[0]
-
-	// Absolute paths can be used directly.
-	if path.IsAbs(exe) {
-		procArgs.Filename = exe
-		return nil
-	}
-
-	// Paths with '/' in them should be joined to the working directory, or
-	// to the root if working directory is not set.
-	if strings.IndexByte(exe, '/') > 0 {
-		if !path.IsAbs(procArgs.WorkingDirectory) {
-			return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory)
-		}
-		procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
-		return nil
-	}
-
-	// Paths with a '/' are relative to the CWD.
-	if strings.IndexByte(exe, '/') > 0 {
-		procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
-		return nil
-	}
 
-	// Otherwise, We must lookup the name in the paths, starting from the
-	// root directory.
-	root := procArgs.MountNamespaceVFS2.Root()
-	defer root.DecRef()
-
-	paths := fs.GetPath(procArgs.Envv)
-	creds := procArgs.Credentials
-
-	for _, p := range paths {
-		binPath := path.Join(p, exe)
-		pop := &vfs.PathOperation{
-			Root:               root,
-			Start:              root,
-			Path:               fspath.Parse(binPath),
-			FollowFinalSymlink: true,
-		}
-		opts := &vfs.OpenOptions{
-			FileExec: true,
-			Flags:    linux.O_RDONLY,
-		}
-		dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
-		if err == syserror.ENOENT || err == syserror.EACCES {
-			// Didn't find it here.
-			continue
-		}
-		if err != nil {
-			return err
-		}
-		dentry.DecRef()
-
-		procArgs.Filename = binPath
-		return nil
+	// Resolve the executable path from working dir and environment.
+	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
+	if err != nil {
+		return err
 	}
-
-	return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":"))
+	procArgs.Filename = resolved
+	return nil
 }
 
 func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
@@ -192,7 +136,7 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 
 func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
 	fd := c.fds.remove()
-	opts := strings.Join(p9MountOptions(fd, conf.FileAccess, true /* vfs2 */), ",")
+	opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
 
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
 	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
@@ -216,8 +160,9 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
 		}
 	}
 
-	// TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go.
-
+	if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
+		return fmt.Errorf(`mount submount "\tmp": %w`, err)
+	}
 	return nil
 }
 
@@ -235,7 +180,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
 		fd := -1
 		// Only bind mounts use host FDs; see
 		// containerMounter.getMountNameAndOptionsVFS2.
-		if m.Type == bind || isBindMount(m) {
+		if m.Type == bind {
 			fd = c.fds.remove()
 		}
 		mounts = append(mounts, mountAndFD{
@@ -255,8 +200,6 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
 	return mounts, nil
 }
 
-// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1
-// version.
 func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
 	root := mns.Root()
 	defer root.DecRef()
@@ -265,12 +208,11 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
 		Start: root,
 		Path:  fspath.Parse(submount.Destination),
 	}
-
-	fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
+	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
 	if err != nil {
 		return fmt.Errorf("mountOptions failed: %w", err)
 	}
-	if fsName == "" {
+	if len(fsName) == 0 {
 		// Filesystem is not supported (e.g. cgroup), just skip it.
 		return nil
 	}
@@ -278,17 +220,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
 	if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
 		return err
 	}
-
-	opts := &vfs.MountOptions{
-		GetFilesystemOptions: vfs.GetFilesystemOptions{
-			Data: strings.Join(options, ","),
-		},
-		InternalMount: true,
-	}
-
-	// All writes go to upper, be paranoid and make lower readonly.
-	opts.ReadOnly = useOverlay
-
 	if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
 		return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
 	}
@@ -298,17 +229,13 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
 
 // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
 // used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, []string, bool, error) {
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
 	var (
-		fsName     string
-		opts       []string
-		useOverlay bool
+		fsName string
+		data   []string
 	)
 
-	if isBindMount(m.Mount) {
-		m.Type = bind
-	}
-
+	// Find filesystem name and FS specific data field.
 	switch m.Type {
 	case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
 		fsName = m.Type
@@ -318,21 +245,46 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
 		fsName = m.Type
 
 		var err error
-		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
 		if err != nil {
-			return "", nil, false, err
+			return "", nil, err
 		}
 
 	case bind:
 		fsName = gofer.Name
-		opts = p9MountOptions(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
-		// If configured, add overlay to all writable mounts.
-		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
 
 	default:
 		log.Warningf("ignoring unknown filesystem type %q", m.Type)
 	}
-	return fsName, opts, useOverlay, nil
+
+	opts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+		},
+		InternalMount: true,
+	}
+
+	for _, o := range m.Options {
+		switch o {
+		case "rw":
+			opts.ReadOnly = false
+		case "ro":
+			opts.ReadOnly = true
+		case "noatime":
+			opts.Flags.NoATime = true
+		case "noexec":
+			opts.Flags.NoExec = true
+		default:
+			log.Warningf("ignoring unknown mount option %q", o)
+		}
+	}
+
+	if conf.Overlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		opts.ReadOnly = true
+	}
+	return fsName, opts, nil
 }
 
 func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
@@ -361,3 +313,63 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
 	}
 	return nil
 }
+
+// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+//   1. /tmp is mounted explicitly: we should not override user's wish
+//   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+	for _, m := range c.mounts {
+		// m.Destination has been cleaned, so it's to use equality here.
+		if m.Destination == "/tmp" {
+			log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
+			return nil
+		}
+	}
+
+	root := mns.Root()
+	defer root.DecRef()
+	pop := vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse("/tmp"),
+	}
+	// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
+	statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{})
+	switch err {
+	case nil:
+		// Found '/tmp' in filesystem, check if it's empty.
+		if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory {
+			// Not a dir?! Leave it be.
+			return nil
+		}
+		if statx.Nlink > 2 {
+			// If more than "." and ".." is found, skip internal tmpfs to prevent
+			// hiding existing files.
+			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
+			return nil
+		}
+		log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
+		fallthrough
+
+	case syserror.ENOENT:
+		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
+		// tmpfs.
+		tmpMount := specs.Mount{
+			Type:        tmpfs.Name,
+			Destination: "/tmp",
+			// Sticky bit is added to prevent accidental deletion of files from
+			// another user. This is normally done for /tmp.
+			Options: []string{"mode=01777"},
+		}
+		return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+
+	default:
+		return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
+	}
+}
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
index d4c7bdfbb..7e34a284a 100644
--- a/runsc/cgroup/BUILD
+++ b/runsc/cgroup/BUILD
@@ -7,8 +7,8 @@ go_library(
     srcs = ["cgroup.go"],
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/cleanup",
         "//pkg/log",
-        "//runsc/specutils",
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
     ],
@@ -20,4 +20,8 @@ go_test(
     srcs = ["cgroup_test.go"],
     library = ":cgroup",
     tags = ["local"],
+    deps = [
+        "//pkg/test/testutil",
+        "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+    ],
 )
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index fa40ee509..e5cc9d622 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -19,6 +19,7 @@ package cgroup
 import (
 	"bufio"
 	"context"
+	"errors"
 	"fmt"
 	"io/ioutil"
 	"os"
@@ -30,29 +31,31 @@ import (
 
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 const (
 	cgroupRoot = "/sys/fs/cgroup"
 )
 
-var controllers = map[string]controller{
-	"blkio":    &blockIO{},
-	"cpu":      &cpu{},
-	"cpuset":   &cpuSet{},
-	"memory":   &memory{},
-	"net_cls":  &networkClass{},
-	"net_prio": &networkPrio{},
-	"pids":     &pids{},
+var controllers = map[string]config{
+	"blkio":    config{ctrlr: &blockIO{}},
+	"cpu":      config{ctrlr: &cpu{}},
+	"cpuset":   config{ctrlr: &cpuSet{}},
+	"hugetlb":  config{ctrlr: &hugeTLB{}, optional: true},
+	"memory":   config{ctrlr: &memory{}},
+	"net_cls":  config{ctrlr: &networkClass{}},
+	"net_prio": config{ctrlr: &networkPrio{}},
+	"pids":     config{ctrlr: &pids{}},
 
 	// These controllers either don't have anything in the OCI spec or is
 	// irrelevant for a sandbox.
-	"devices":    &noop{},
-	"freezer":    &noop{},
-	"perf_event": &noop{},
-	"systemd":    &noop{},
+	"devices":    config{ctrlr: &noop{}},
+	"freezer":    config{ctrlr: &noop{}},
+	"perf_event": config{ctrlr: &noop{}},
+	"rdma":       config{ctrlr: &noop{}, optional: true},
+	"systemd":    config{ctrlr: &noop{}},
 }
 
 func setOptionalValueInt(path, name string, val *int64) error {
@@ -122,7 +125,7 @@ func fillFromAncestor(path string) (string, error) {
 		return val, nil
 	}
 
-	// File is not set, recurse to parent and then  set here.
+	// File is not set, recurse to parent and then set here.
 	name := filepath.Base(path)
 	parent := filepath.Dir(filepath.Dir(path))
 	val, err = fillFromAncestor(filepath.Join(parent, name))
@@ -196,8 +199,9 @@ func LoadPaths(pid string) (map[string]string, error) {
 	return paths, nil
 }
 
-// Cgroup represents a group inside all controllers. For example: Name='/foo/bar'
-// maps to /sys/fs/cgroup/<controller>/foo/bar on all controllers.
+// Cgroup represents a group inside all controllers. For example:
+//   Name='/foo/bar' maps to /sys/fs/cgroup/<controller>/foo/bar on
+//   all controllers.
 type Cgroup struct {
 	Name    string            `json:"name"`
 	Parents map[string]string `json:"parents"`
@@ -242,16 +246,20 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error {
 
 	// The Cleanup object cleans up partially created cgroups when an error occurs.
 	// Errors occuring during cleanup itself are ignored.
-	clean := specutils.MakeCleanup(func() { _ = c.Uninstall() })
+	clean := cleanup.Make(func() { _ = c.Uninstall() })
 	defer clean.Clean()
 
-	for key, ctrl := range controllers {
+	for key, cfg := range controllers {
 		path := c.makePath(key)
 		if err := os.MkdirAll(path, 0755); err != nil {
+			if cfg.optional && errors.Is(err, syscall.EROFS) {
+				log.Infof("Skipping cgroup %q", key)
+				continue
+			}
 			return err
 		}
 		if res != nil {
-			if err := ctrl.set(res, path); err != nil {
+			if err := cfg.ctrlr.set(res, path); err != nil {
 				return err
 			}
 		}
@@ -321,10 +329,13 @@ func (c *Cgroup) Join() (func(), error) {
 	}
 
 	// Now join the cgroups.
-	for key := range controllers {
+	for key, cfg := range controllers {
 		path := c.makePath(key)
 		log.Debugf("Joining cgroup %q", path)
 		if err := setValue(path, "cgroup.procs", "0"); err != nil {
+			if cfg.optional && os.IsNotExist(err) {
+				continue
+			}
 			return undo, err
 		}
 	}
@@ -375,6 +386,11 @@ func (c *Cgroup) makePath(controllerName string) string {
 	return filepath.Join(cgroupRoot, controllerName, path)
 }
 
+type config struct {
+	ctrlr    controller
+	optional bool
+}
+
 type controller interface {
 	set(*specs.LinuxResources, string) error
 }
@@ -430,7 +446,13 @@ func (*cpu) set(spec *specs.LinuxResources, path string) error {
 	if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil {
 		return err
 	}
-	return setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period)
+	if err := setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period); err != nil {
+		return err
+	}
+	if err := setOptionalValueUint(path, "cpu.rt_period_us", spec.CPU.RealtimePeriod); err != nil {
+		return err
+	}
+	return setOptionalValueInt(path, "cpu.rt_runtime_us", spec.CPU.RealtimeRuntime)
 }
 
 type cpuSet struct{}
@@ -471,13 +493,17 @@ func (*blockIO) set(spec *specs.LinuxResources, path string) error {
 	}
 
 	for _, dev := range spec.BlockIO.WeightDevice {
-		val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Weight)
-		if err := setValue(path, "blkio.weight_device", val); err != nil {
-			return err
+		if dev.Weight != nil {
+			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight)
+			if err := setValue(path, "blkio.weight_device", val); err != nil {
+				return err
+			}
 		}
-		val = fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.LeafWeight)
-		if err := setValue(path, "blkio.leaf_weight_device", val); err != nil {
-			return err
+		if dev.LeafWeight != nil {
+			val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.LeafWeight)
+			if err := setValue(path, "blkio.leaf_weight_device", val); err != nil {
+				return err
+			}
 		}
 	}
 	if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil {
@@ -529,9 +555,22 @@ func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
 type pids struct{}
 
 func (*pids) set(spec *specs.LinuxResources, path string) error {
-	if spec.Pids == nil {
+	if spec.Pids == nil || spec.Pids.Limit <= 0 {
 		return nil
 	}
 	val := strconv.FormatInt(spec.Pids.Limit, 10)
 	return setValue(path, "pids.max", val)
 }
+
+type hugeTLB struct{}
+
+func (*hugeTLB) set(spec *specs.LinuxResources, path string) error {
+	for _, limit := range spec.HugepageLimits {
+		name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize)
+		val := strconv.FormatUint(limit.Limit, 10)
+		if err := setValue(path, name, val); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go
index 548c80e9a..4db5ee5c3 100644
--- a/runsc/cgroup/cgroup_test.go
+++ b/runsc/cgroup/cgroup_test.go
@@ -15,7 +15,14 @@
 package cgroup
 
 import (
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
 	"testing"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/test/testutil"
 )
 
 func TestUninstallEnoent(t *testing.T) {
@@ -65,3 +72,578 @@ func TestCountCpuset(t *testing.T) {
 		})
 	}
 }
+
+func uint16Ptr(v uint16) *uint16 {
+	return &v
+}
+
+func uint32Ptr(v uint32) *uint32 {
+	return &v
+}
+
+func int64Ptr(v int64) *int64 {
+	return &v
+}
+
+func uint64Ptr(v uint64) *uint64 {
+	return &v
+}
+
+func boolPtr(v bool) *bool {
+	return &v
+}
+
+func checkDir(t *testing.T, dir string, contents map[string]string) {
+	all, err := ioutil.ReadDir(dir)
+	if err != nil {
+		t.Fatalf("ReadDir(%q): %v", dir, err)
+	}
+	fileCount := 0
+	for _, file := range all {
+		if file.IsDir() {
+			// Only want to compare files.
+			continue
+		}
+		fileCount++
+
+		want, ok := contents[file.Name()]
+		if !ok {
+			t.Errorf("file not expected: %q", file.Name())
+			continue
+		}
+		gotBytes, err := ioutil.ReadFile(filepath.Join(dir, file.Name()))
+		if err != nil {
+			t.Fatal(err.Error())
+		}
+		got := strings.TrimSuffix(string(gotBytes), "\n")
+		if got != want {
+			t.Errorf("wrong file content, file: %q, want: %q, got: %q", file.Name(), want, got)
+		}
+	}
+	if fileCount != len(contents) {
+		t.Errorf("file is missing, want: %v, got: %v", contents, all)
+	}
+}
+
+func makeLinuxWeightDevice(major, minor int64, weight, leafWeight *uint16) specs.LinuxWeightDevice {
+	rv := specs.LinuxWeightDevice{
+		Weight:     weight,
+		LeafWeight: leafWeight,
+	}
+	rv.Major = major
+	rv.Minor = minor
+	return rv
+}
+
+func makeLinuxThrottleDevice(major, minor int64, rate uint64) specs.LinuxThrottleDevice {
+	rv := specs.LinuxThrottleDevice{
+		Rate: rate,
+	}
+	rv.Major = major
+	rv.Minor = minor
+	return rv
+}
+
+func TestBlockIO(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxBlockIO
+		wants map[string]string
+	}{
+		{
+			name: "simple",
+			spec: &specs.LinuxBlockIO{
+				Weight:     uint16Ptr(1),
+				LeafWeight: uint16Ptr(2),
+			},
+			wants: map[string]string{
+				"blkio.weight":      "1",
+				"blkio.leaf_weight": "2",
+			},
+		},
+		{
+			name: "weight_device",
+			spec: &specs.LinuxBlockIO{
+				WeightDevice: []specs.LinuxWeightDevice{
+					makeLinuxWeightDevice(1, 2, uint16Ptr(3), uint16Ptr(4)),
+				},
+			},
+			wants: map[string]string{
+				"blkio.weight_device":      "1:2 3",
+				"blkio.leaf_weight_device": "1:2 4",
+			},
+		},
+		{
+			name: "weight_device_nil_values",
+			spec: &specs.LinuxBlockIO{
+				WeightDevice: []specs.LinuxWeightDevice{
+					makeLinuxWeightDevice(1, 2, nil, nil),
+				},
+			},
+		},
+		{
+			name: "throttle",
+			spec: &specs.LinuxBlockIO{
+				ThrottleReadBpsDevice: []specs.LinuxThrottleDevice{
+					makeLinuxThrottleDevice(1, 2, 3),
+				},
+				ThrottleReadIOPSDevice: []specs.LinuxThrottleDevice{
+					makeLinuxThrottleDevice(4, 5, 6),
+				},
+				ThrottleWriteBpsDevice: []specs.LinuxThrottleDevice{
+					makeLinuxThrottleDevice(7, 8, 9),
+				},
+				ThrottleWriteIOPSDevice: []specs.LinuxThrottleDevice{
+					makeLinuxThrottleDevice(10, 11, 12),
+				},
+			},
+			wants: map[string]string{
+				"blkio.throttle.read_bps_device":   "1:2 3",
+				"blkio.throttle.read_iops_device":  "4:5 6",
+				"blkio.throttle.write_bps_device":  "7:8 9",
+				"blkio.throttle.write_iops_device": "10:11 12",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxBlockIO{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				BlockIO: tc.spec,
+			}
+			ctrlr := blockIO{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestCPU(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxCPU
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxCPU{
+				Shares:          uint64Ptr(1),
+				Quota:           int64Ptr(2),
+				Period:          uint64Ptr(3),
+				RealtimeRuntime: int64Ptr(4),
+				RealtimePeriod:  uint64Ptr(5),
+			},
+			wants: map[string]string{
+				"cpu.shares":        "1",
+				"cpu.cfs_quota_us":  "2",
+				"cpu.cfs_period_us": "3",
+				"cpu.rt_runtime_us": "4",
+				"cpu.rt_period_us":  "5",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxCPU{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				CPU: tc.spec,
+			}
+			ctrlr := cpu{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestCPUSet(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxCPU
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxCPU{
+				Cpus: "foo",
+				Mems: "bar",
+			},
+			wants: map[string]string{
+				"cpuset.cpus": "foo",
+				"cpuset.mems": "bar",
+			},
+		},
+		// Don't test nil values because they are copied from the parent.
+		// See TestCPUSetAncestor().
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				CPU: tc.spec,
+			}
+			ctrlr := cpuSet{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+// TestCPUSetAncestor checks that, when not available, value is read from
+// parent directory.
+func TestCPUSetAncestor(t *testing.T) {
+	// Prepare master directory with cgroup files that will be propagated to
+	// children.
+	grandpa, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+	if err != nil {
+		t.Fatalf("error creating temporary directory: %v", err)
+	}
+	defer os.RemoveAll(grandpa)
+
+	if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.cpus"), []byte("parent-cpus"), 0666); err != nil {
+		t.Fatalf("ioutil.WriteFile(): %v", err)
+	}
+	if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.mems"), []byte("parent-mems"), 0666); err != nil {
+		t.Fatalf("ioutil.WriteFile(): %v", err)
+	}
+
+	for _, tc := range []struct {
+		name string
+		spec *specs.LinuxCPU
+	}{
+		{
+			name: "nil_values",
+			spec: &specs.LinuxCPU{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			// Create empty files in intermediate directory. They should be ignored
+			// when reading, and then populated from parent.
+			parent, err := ioutil.TempDir(grandpa, "parent")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(parent)
+			if _, err := os.Create(filepath.Join(parent, "cpuset.cpus")); err != nil {
+				t.Fatalf("os.Create(): %v", err)
+			}
+			if _, err := os.Create(filepath.Join(parent, "cpuset.mems")); err != nil {
+				t.Fatalf("os.Create(): %v", err)
+			}
+
+			// cgroup files mmust exist.
+			dir, err := ioutil.TempDir(parent, "child")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			if _, err := os.Create(filepath.Join(dir, "cpuset.cpus")); err != nil {
+				t.Fatalf("os.Create(): %v", err)
+			}
+			if _, err := os.Create(filepath.Join(dir, "cpuset.mems")); err != nil {
+				t.Fatalf("os.Create(): %v", err)
+			}
+
+			spec := &specs.LinuxResources{
+				CPU: tc.spec,
+			}
+			ctrlr := cpuSet{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			want := map[string]string{
+				"cpuset.cpus": "parent-cpus",
+				"cpuset.mems": "parent-mems",
+			}
+			// Both path and dir must have been populated from grandpa.
+			checkDir(t, parent, want)
+			checkDir(t, dir, want)
+		})
+	}
+}
+
+func TestHugeTlb(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  []specs.LinuxHugepageLimit
+		wants map[string]string
+	}{
+		{
+			name: "single",
+			spec: []specs.LinuxHugepageLimit{
+				{
+					Pagesize: "1G",
+					Limit:    123,
+				},
+			},
+			wants: map[string]string{
+				"hugetlb.1G.limit_in_bytes": "123",
+			},
+		},
+		{
+			name: "multiple",
+			spec: []specs.LinuxHugepageLimit{
+				{
+					Pagesize: "1G",
+					Limit:    123,
+				},
+				{
+					Pagesize: "2G",
+					Limit:    456,
+				},
+				{
+					Pagesize: "1P",
+					Limit:    789,
+				},
+			},
+			wants: map[string]string{
+				"hugetlb.1G.limit_in_bytes": "123",
+				"hugetlb.2G.limit_in_bytes": "456",
+				"hugetlb.1P.limit_in_bytes": "789",
+			},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				HugepageLimits: tc.spec,
+			}
+			ctrlr := hugeTLB{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestMemory(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxMemory
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxMemory{
+				Limit:            int64Ptr(1),
+				Reservation:      int64Ptr(2),
+				Swap:             int64Ptr(3),
+				Kernel:           int64Ptr(4),
+				KernelTCP:        int64Ptr(5),
+				Swappiness:       uint64Ptr(6),
+				DisableOOMKiller: boolPtr(true),
+			},
+			wants: map[string]string{
+				"memory.limit_in_bytes":          "1",
+				"memory.soft_limit_in_bytes":     "2",
+				"memory.memsw.limit_in_bytes":    "3",
+				"memory.kmem.limit_in_bytes":     "4",
+				"memory.kmem.tcp.limit_in_bytes": "5",
+				"memory.swappiness":              "6",
+				"memory.oom_control":             "1",
+			},
+		},
+		{
+			// Disable OOM killer should only write when set to true.
+			name: "oomkiller",
+			spec: &specs.LinuxMemory{
+				DisableOOMKiller: boolPtr(false),
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxMemory{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				Memory: tc.spec,
+			}
+			ctrlr := memory{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestNetworkClass(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxNetwork
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxNetwork{
+				ClassID: uint32Ptr(1),
+			},
+			wants: map[string]string{
+				"net_cls.classid": "1",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxNetwork{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				Network: tc.spec,
+			}
+			ctrlr := networkClass{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestNetworkPriority(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxNetwork
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxNetwork{
+				Priorities: []specs.LinuxInterfacePriority{
+					{
+						Name:     "foo",
+						Priority: 1,
+					},
+				},
+			},
+			wants: map[string]string{
+				"net_prio.ifpriomap": "foo 1",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxNetwork{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				Network: tc.spec,
+			}
+			ctrlr := networkPrio{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
+
+func TestPids(t *testing.T) {
+	for _, tc := range []struct {
+		name  string
+		spec  *specs.LinuxPids
+		wants map[string]string
+	}{
+		{
+			name: "all",
+			spec: &specs.LinuxPids{Limit: 1},
+			wants: map[string]string{
+				"pids.max": "1",
+			},
+		},
+		{
+			name: "nil_values",
+			spec: &specs.LinuxPids{},
+		},
+		{
+			name: "nil",
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			defer os.RemoveAll(dir)
+
+			spec := &specs.LinuxResources{
+				Pids: tc.spec,
+			}
+			ctrlr := pids{}
+			if err := ctrlr.set(spec, dir); err != nil {
+				t.Fatalf("ctrlr.set(): %v", err)
+			}
+			checkDir(t, dir, tc.wants)
+		})
+	}
+}
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 4c2ac6ff0..01204ab4d 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -136,7 +136,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	// Ensure that if there is a panic, all goroutine stacks are printed.
-	debug.SetTraceback("all")
+	debug.SetTraceback("system")
 
 	conf := args[0].(*boot.Config)
 
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 28f0d54b9..3966e2d21 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -168,7 +168,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Start with root mount, then add any other additional mount as needed.
 	ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
 	ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
-		ROMount:      spec.Root.Readonly,
+		ROMount:      spec.Root.Readonly || conf.Overlay,
 		PanicOnWrite: g.panicOnWrite,
 	})
 	if err != nil {
@@ -181,7 +181,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	for _, m := range spec.Mounts {
 		if specutils.Is9PMount(m) {
 			cfg := fsgofer.Config{
-				ROMount:      isReadonlyMount(m.Options),
+				ROMount:      isReadonlyMount(m.Options) || conf.Overlay,
 				PanicOnWrite: g.panicOnWrite,
 				HostUDS:      conf.FSGoferHostUDS,
 			}
@@ -306,7 +306,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 	}
 
 	// Replace the current spec, with the clean spec with symlinks resolved.
-	if err := setupMounts(spec.Mounts, root); err != nil {
+	if err := setupMounts(conf, spec.Mounts, root); err != nil {
 		Fatalf("error setting up FS: %v", err)
 	}
 
@@ -322,7 +322,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 	}
 
 	// Check if root needs to be remounted as readonly.
-	if spec.Root.Readonly {
+	if spec.Root.Readonly || conf.Overlay {
 		// If root is a mount point but not read-only, we can change mount options
 		// to make it read-only for extra safety.
 		log.Infof("Remounting root as readonly: %q", root)
@@ -346,7 +346,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
 // setupMounts binds mount all mounts specified in the spec in their correct
 // location inside root. It will resolve relative paths and symlinks. It also
 // creates directories as needed.
-func setupMounts(mounts []specs.Mount, root string) error {
+func setupMounts(conf *boot.Config, mounts []specs.Mount, root string) error {
 	for _, m := range mounts {
 		if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
 			continue
@@ -358,6 +358,11 @@ func setupMounts(mounts []specs.Mount, root string) error {
 		}
 
 		flags := specutils.OptionsToFlags(m.Options) | syscall.MS_BIND
+		if conf.Overlay {
+			// Force mount read-only if writes are not going to be sent to it.
+			flags |= syscall.MS_RDONLY
+		}
+
 		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
 		if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil {
 			return fmt.Errorf("mounting %v: %v", m, err)
diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go
index c7d210140..cd85dabbb 100644
--- a/runsc/cmd/help.go
+++ b/runsc/cmd/help.go
@@ -65,16 +65,10 @@ func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}
 	switch f.NArg() {
 	case 0:
 		fmt.Fprintf(h.cdr.Output, "Usage: %s <flags> <subcommand> <subcommand args>\n\n", h.cdr.Name())
-		fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open
-Container Initiative (OCI) format. Applications run by runsc are run in an
-isolated gVisor sandbox that emulates a Linux environment.
+		fmt.Fprintf(h.cdr.Output, `runsc is the gVisor container runtime.
 
-gVisor is a user-space kernel, written in Go, that implements a substantial
-portion of the Linux system call interface. It provides an additional layer
-of isolation between running applications and the host operating system.
-
-Functionality is provided by subcommands. For additonal help on individual
-subcommands use "%s %s <subcommand>".
+Functionality is provided by subcommands. For help with a specific subcommand,
+use "%s %s <subcommand>".
 
 `, h.cdr.Name(), h.Name())
 		h.cdr.VisitGroups(func(g *subcommands.CommandGroup) {
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
index 8e2b36e85..a2b0a4b14 100644
--- a/runsc/cmd/spec.go
+++ b/runsc/cmd/spec.go
@@ -16,6 +16,7 @@ package cmd
 
 import (
 	"context"
+	"fmt"
 	"io/ioutil"
 	"os"
 	"path/filepath"
@@ -24,7 +25,8 @@ import (
 	"gvisor.dev/gvisor/runsc/flag"
 )
 
-var specTemplate = []byte(`{
+func genSpec(cwd string) []byte {
+	var template = fmt.Sprintf(`{
 	"ociVersion": "1.0.0",
 	"process": {
 		"terminal": true,
@@ -39,7 +41,7 @@ var specTemplate = []byte(`{
 			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
 			"TERM=xterm"
 		],
-		"cwd": "/",
+		"cwd": "%s",
 		"capabilities": {
 			"bounding": [
 				"CAP_AUDIT_WRITE",
@@ -123,11 +125,15 @@ var specTemplate = []byte(`{
 			}
 		]
 	}
-}`)
+}`, cwd)
+
+	return []byte(template)
+}
 
 // Spec implements subcommands.Command for the "spec" command.
 type Spec struct {
 	bundle string
+	cwd    string
 }
 
 // Name implements subcommands.Command.Name.
@@ -165,6 +171,8 @@ EXAMPLE:
 // SetFlags implements subcommands.Command.SetFlags.
 func (s *Spec) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle")
+	f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+
+		"this value MUST be an absolute path")
 }
 
 // Execute implements subcommands.Command.Execute.
@@ -174,7 +182,9 @@ func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		Fatalf("file %q already exists", confPath)
 	}
 
-	if err := ioutil.WriteFile(confPath, specTemplate, 0664); err != nil {
+	var spec = genSpec(s.cwd)
+
+	if err := ioutil.WriteFile(confPath, spec, 0664); err != nil {
 		Fatalf("writing to %q: %v", confPath, err)
 	}
 
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 46154df60..49cfb0837 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -16,6 +16,7 @@ go_library(
     ],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/cleanup",
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/sighandling",
@@ -46,13 +47,14 @@ go_test(
         "//test/cmd/test_app",
     ],
     library = ":container",
-    shard_count = 5,
+    shard_count = 10,
     tags = [
         "requires-kvm",
     ],
     deps = [
         "//pkg/abi/linux",
         "//pkg/bits",
+        "//pkg/cleanup",
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/kernel",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 294dca5e7..3813c6b93 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -119,7 +119,7 @@ func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) {
 
 // Test that an pty FD is sent over the console socket if one is provided.
 func TestConsoleSocket(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			spec := testutil.NewSpecWithArgs("true")
 			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 8539f252d..6d297d0df 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -31,6 +31,7 @@ import (
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/sighandling"
@@ -293,7 +294,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 	}
 	// The Cleanup object cleans up partially created containers when an error
 	// occurs. Any errors occurring during cleanup itself are ignored.
-	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+	cu := cleanup.Make(func() { _ = c.Destroy() })
 	defer cu.Clean()
 
 	// Lock the container metadata file to prevent concurrent creations of
@@ -402,7 +403,7 @@ func (c *Container) Start(conf *boot.Config) error {
 	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	unlock := specutils.MakeCleanup(func() { c.Saver.unlock() })
+	unlock := cleanup.Make(func() { c.Saver.unlock() })
 	defer unlock.Clean()
 
 	if err := c.requireStatus("start", Created); err != nil {
@@ -506,7 +507,7 @@ func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) {
 	}
 	// Clean up partially created container if an error occurs.
 	// Any errors returned by Destroy() itself are ignored.
-	cu := specutils.MakeCleanup(func() {
+	cu := cleanup.Make(func() {
 		c.Destroy()
 	})
 	defer cu.Clean()
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 7ba301331..cd76645bd 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"math"
 	"os"
 	"path"
 	"path/filepath"
@@ -53,9 +54,8 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
 			err = fmt.Errorf("error getting process data from container: %v", err)
 			return &backoff.PermanentError{Err: err}
 		}
-		if r, err := procListsEqual(got, want); !r {
-			return fmt.Errorf("container got process list: %s, want: %s: error: %v",
-				procListToString(got), procListToString(want), err)
+		if !procListsEqual(got, want) {
+			return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
 		}
 		return nil
 	}
@@ -92,36 +92,72 @@ func blockUntilWaitable(pid int) error {
 	return err
 }
 
-// procListsEqual is used to check whether 2 Process lists are equal for all
-// implemented fields.
-func procListsEqual(got, want []*control.Process) (bool, error) {
-	if len(got) != len(want) {
-		return false, nil
-	}
-	for i := range got {
-		pd1 := got[i]
-		pd2 := want[i]
-		// Zero out timing dependant fields.
-		pd1.Time = ""
-		pd1.STime = ""
-		pd1.C = 0
-		// Ignore TTY field too, since it's not relevant in the cases
-		// where we use this method. Tests that care about the TTY
-		// field should check for it themselves.
-		pd1.TTY = ""
-		pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1})
-		if err != nil {
-			return false, err
+// procListsEqual is used to check whether 2 Process lists are equal. Fields
+// set to -1 in wants are ignored. Timestamp and threads fields are always
+// ignored.
+func procListsEqual(gots, wants []*control.Process) bool {
+	if len(gots) != len(wants) {
+		return false
+	}
+	for i := range gots {
+		got := gots[i]
+		want := wants[i]
+
+		if want.UID != math.MaxUint32 && want.UID != got.UID {
+			return false
 		}
-		pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2})
-		if err != nil {
-			return false, err
+		if want.PID != -1 && want.PID != got.PID {
+			return false
 		}
-		if pd1Json != pd2Json {
-			return false, nil
+		if want.PPID != -1 && want.PPID != got.PPID {
+			return false
 		}
+		if len(want.TTY) != 0 && want.TTY != got.TTY {
+			return false
+		}
+		if len(want.Cmd) != 0 && want.Cmd != got.Cmd {
+			return false
+		}
+	}
+	return true
+}
+
+type processBuilder struct {
+	process control.Process
+}
+
+func newProcessBuilder() *processBuilder {
+	return &processBuilder{
+		process: control.Process{
+			UID:  math.MaxUint32,
+			PID:  -1,
+			PPID: -1,
+		},
 	}
-	return true, nil
+}
+
+func (p *processBuilder) Cmd(cmd string) *processBuilder {
+	p.process.Cmd = cmd
+	return p
+}
+
+func (p *processBuilder) PID(pid kernel.ThreadID) *processBuilder {
+	p.process.PID = pid
+	return p
+}
+
+func (p *processBuilder) PPID(ppid kernel.ThreadID) *processBuilder {
+	p.process.PPID = ppid
+	return p
+}
+
+func (p *processBuilder) UID(uid auth.KUID) *processBuilder {
+	p.process.UID = uid
+	return p
+}
+
+func (p *processBuilder) Process() *control.Process {
+	return &p.process
 }
 
 func procListToString(pl []*control.Process) string {
@@ -256,8 +292,6 @@ var (
 func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
 	// Always load the default config.
 	cs := make(map[string]*boot.Config)
-	cs["default"] = testutil.TestConfig(t)
-
 	for _, o := range opts {
 		switch o {
 		case overlay:
@@ -285,9 +319,16 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
 
 func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config {
 	vfs1 := configs(t, opts...)
-	vfs2 := configs(t, opts...)
 
-	for key, value := range vfs2 {
+	var optsVFS2 []configOption
+	for _, opt := range opts {
+		// TODO(gvisor.dev/issue/1487): Enable overlay tests.
+		if opt != overlay {
+			optsVFS2 = append(optsVFS2, opt)
+		}
+	}
+
+	for key, value := range configs(t, optsVFS2...) {
 		value.VFS2 = true
 		vfs1[key+"VFS2"] = value
 	}
@@ -318,14 +359,7 @@ func TestLifecycle(t *testing.T) {
 
 			// expectedPL lists the expected process state of the container.
 			expectedPL := []*control.Process{
-				{
-					UID:     0,
-					PID:     1,
-					PPID:    0,
-					C:       0,
-					Cmd:     "sleep",
-					Threads: []kernel.ThreadID{1},
-				},
+				newProcessBuilder().Cmd("sleep").Process(),
 			}
 			// Create the container.
 			args := Args{
@@ -603,10 +637,14 @@ func doAppExitStatus(t *testing.T, vfs2 bool) {
 
 // TestExec verifies that a container can exec a new program.
 func TestExec(t *testing.T) {
-	for name, conf := range configs(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
-			const uid = 343
-			spec := testutil.NewSpecWithArgs("sleep", "100")
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "exec-test")
+			if err != nil {
+				t.Fatalf("error creating temporary directory: %v", err)
+			}
+			cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100", dir)
+			spec := testutil.NewSpecWithArgs("sh", "-c", cmd)
 
 			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
@@ -629,29 +667,127 @@ func TestExec(t *testing.T) {
 				t.Fatalf("error starting container: %v", err)
 			}
 
-			// expectedPL lists the expected process state of the container.
+			// Wait until sleep is running to ensure the symlink was created.
 			expectedPL := []*control.Process{
+				newProcessBuilder().Cmd("sh").Process(),
+				newProcessBuilder().Cmd("sleep").Process(),
+			}
+			if err := waitForProcessList(cont, expectedPL); err != nil {
+				t.Fatalf("waitForProcessList: %v", err)
+			}
+
+			for _, tc := range []struct {
+				name string
+				args control.ExecArgs
+			}{
+				{
+					name: "complete",
+					args: control.ExecArgs{
+						Filename: "/bin/true",
+						Argv:     []string{"/bin/true"},
+					},
+				},
+				{
+					name: "filename",
+					args: control.ExecArgs{
+						Filename: "/bin/true",
+					},
+				},
+				{
+					name: "argv",
+					args: control.ExecArgs{
+						Argv: []string{"/bin/true"},
+					},
+				},
+				{
+					name: "filename resolution",
+					args: control.ExecArgs{
+						Filename: "true",
+						Envv:     []string{"PATH=/bin"},
+					},
+				},
 				{
-					UID:     0,
-					PID:     1,
-					PPID:    0,
-					C:       0,
-					Cmd:     "sleep",
-					Threads: []kernel.ThreadID{1},
+					name: "argv resolution",
+					args: control.ExecArgs{
+						Argv: []string{"true"},
+						Envv: []string{"PATH=/bin"},
+					},
 				},
 				{
-					UID:     uid,
-					PID:     2,
-					PPID:    0,
-					C:       0,
-					Cmd:     "sleep",
-					Threads: []kernel.ThreadID{2},
+					name: "argv symlink",
+					args: control.ExecArgs{
+						Argv: []string{filepath.Join(dir, "symlink")},
+					},
 				},
+				{
+					name: "working dir",
+					args: control.ExecArgs{
+						Argv:             []string{"/bin/sh", "-c", `if [[ "${PWD}" != "/tmp" ]]; then exit 1; fi`},
+						WorkingDirectory: "/tmp",
+					},
+				},
+				{
+					name: "user",
+					args: control.ExecArgs{
+						Argv: []string{"/bin/sh", "-c", `if [[ "$(id -u)" != "343" ]]; then exit 1; fi`},
+						KUID: 343,
+					},
+				},
+				{
+					name: "group",
+					args: control.ExecArgs{
+						Argv: []string{"/bin/sh", "-c", `if [[ "$(id -g)" != "343" ]]; then exit 1; fi`},
+						KGID: 343,
+					},
+				},
+				{
+					name: "env",
+					args: control.ExecArgs{
+						Argv: []string{"/bin/sh", "-c", `if [[ "${FOO}" != "123" ]]; then exit 1; fi`},
+						Envv: []string{"FOO=123"},
+					},
+				},
+			} {
+				t.Run(tc.name, func(t *testing.T) {
+					// t.Parallel()
+					if ws, err := cont.executeSync(&tc.args); err != nil {
+						t.Fatalf("executeAsync(%+v): %v", tc.args, err)
+					} else if ws != 0 {
+						t.Fatalf("executeAsync(%+v) failed with exit: %v", tc.args, ws)
+					}
+				})
 			}
+		})
+	}
+}
 
-			// Verify that "sleep 100" is running.
-			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
-				t.Error(err)
+// TestExecProcList verifies that a container can exec a new program and it
+// shows correcly in the process list.
+func TestExecProcList(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			const uid = 343
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			cont, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer cont.Destroy()
+			if err := cont.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
 			}
 
 			execArgs := &control.ExecArgs{
@@ -661,9 +797,8 @@ func TestExec(t *testing.T) {
 				KUID:             uid,
 			}
 
-			// Verify that "sleep 100" and "sleep 5" are running
-			// after exec.  First, start running exec (whick
-			// blocks).
+			// Verify that "sleep 100" and "sleep 5" are running after exec. First,
+			// start running exec (which blocks).
 			ch := make(chan error)
 			go func() {
 				exitStatus, err := cont.executeSync(execArgs)
@@ -676,6 +811,11 @@ func TestExec(t *testing.T) {
 				}
 			}()
 
+			// expectedPL lists the expected process state of the container.
+			expectedPL := []*control.Process{
+				newProcessBuilder().PID(1).PPID(0).Cmd("sleep").UID(0).Process(),
+				newProcessBuilder().PID(2).PPID(0).Cmd("sleep").UID(uid).Process(),
+			}
 			if err := waitForProcessList(cont, expectedPL); err != nil {
 				t.Fatalf("error waiting for processes: %v", err)
 			}
@@ -695,7 +835,7 @@ func TestExec(t *testing.T) {
 
 // TestKillPid verifies that we can signal individual exec'd processes.
 func TestKillPid(t *testing.T) {
-	for name, conf := range configs(t, overlay) {
+	for name, conf := range configsWithVFS2(t, overlay) {
 		t.Run(name, func(t *testing.T) {
 			app, err := testutil.FindFile("test/cmd/test_app/test_app")
 			if err != nil {
@@ -1211,7 +1351,7 @@ func TestCapabilities(t *testing.T) {
 	uid := auth.KUID(os.Getuid() + 1)
 	gid := auth.KGID(os.Getgid() + 1)
 
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			spec := testutil.NewSpecWithArgs("sleep", "100")
 			rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
@@ -1237,24 +1377,9 @@ func TestCapabilities(t *testing.T) {
 
 			// expectedPL lists the expected process state of the container.
 			expectedPL := []*control.Process{
-				{
-					UID:     0,
-					PID:     1,
-					PPID:    0,
-					C:       0,
-					Cmd:     "sleep",
-					Threads: []kernel.ThreadID{1},
-				},
-				{
-					UID:     uid,
-					PID:     2,
-					PPID:    0,
-					C:       0,
-					Cmd:     "exe",
-					Threads: []kernel.ThreadID{2},
-				},
+				newProcessBuilder().Cmd("sleep").Process(),
 			}
-			if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
+			if err := waitForProcessList(cont, expectedPL); err != nil {
 				t.Fatalf("Failed to wait for sleep to start, err: %v", err)
 			}
 
@@ -1409,7 +1534,7 @@ func TestReadonlyRoot(t *testing.T) {
 }
 
 func TestUIDMap(t *testing.T) {
-	for name, conf := range configs(t, noOverlay...) {
+	for name, conf := range configsWithVFS2(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
 			if err != nil {
@@ -1537,28 +1662,6 @@ func TestReadonlyMount(t *testing.T) {
 	}
 }
 
-func TestBindMountByOption(t *testing.T) {
-	for _, conf := range configs(t, overlay) {
-		t.Logf("Running test with conf: %+v", conf)
-
-		dir, err := ioutil.TempDir(testutil.TmpDir(), "bind-mount")
-		spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
-		if err != nil {
-			t.Fatalf("ioutil.TempDir() failed: %v", err)
-		}
-		spec.Mounts = append(spec.Mounts, specs.Mount{
-			Destination: dir,
-			Source:      dir,
-			Type:        "none",
-			Options:     []string{"rw", "bind"},
-		})
-
-		if err := run(spec, conf); err != nil {
-			t.Fatalf("error running sandbox: %v", err)
-		}
-	}
-}
-
 // TestAbbreviatedIDs checks that runsc supports using abbreviated container
 // IDs in place of full IDs.
 func TestAbbreviatedIDs(t *testing.T) {
@@ -1760,7 +1863,7 @@ func TestUserLog(t *testing.T) {
 	if err != nil {
 		t.Fatalf("error opening user log file %q: %v", userLog, err)
 	}
-	if want := "Unsupported syscall: sched_rr_get_interval"; !strings.Contains(string(out), want) {
+	if want := "Unsupported syscall sched_rr_get_interval("; !strings.Contains(string(out), want) {
 		t.Errorf("user log file doesn't contain %q, out: %s", want, string(out))
 	}
 }
@@ -1908,7 +2011,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) {
 }
 
 func TestCreateWorkingDir(t *testing.T) {
-	for name, conf := range configs(t, overlay) {
+	for name, conf := range configsWithVFS2(t, overlay) {
 		t.Run(name, func(t *testing.T) {
 			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
 			if err != nil {
@@ -2031,7 +2134,7 @@ func TestMountPropagation(t *testing.T) {
 }
 
 func TestMountSymlink(t *testing.T) {
-	for name, conf := range configs(t, overlay) {
+	for name, conf := range configsWithVFS2(t, overlay) {
 		t.Run(name, func(t *testing.T) {
 			dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
 			if err != nil {
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index f6861b1dd..c2b54696c 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -27,6 +27,7 @@ import (
 	"time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -64,29 +65,16 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
 
-	var (
-		containers []*Container
-		cleanups   []func()
-	)
-	cleanups = append(cleanups, func() {
-		for _, c := range containers {
-			c.Destroy()
-		}
-	})
-	cleanupAll := func() {
-		for _, c := range cleanups {
-			c()
-		}
-	}
-	localClean := specutils.MakeCleanup(cleanupAll)
-	defer localClean.Clean()
+	cu := cleanup.Cleanup{}
+	defer cu.Clean()
 
+	var containers []*Container
 	for i, spec := range specs {
 		bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
 		if err != nil {
 			return nil, nil, fmt.Errorf("error setting up container: %v", err)
 		}
-		cleanups = append(cleanups, cleanup)
+		cu.Add(cleanup)
 
 		args := Args{
 			ID:        ids[i],
@@ -97,6 +85,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		if err != nil {
 			return nil, nil, fmt.Errorf("error creating container: %v", err)
 		}
+		cu.Add(func() { cont.Destroy() })
 		containers = append(containers, cont)
 
 		if err := cont.Start(conf); err != nil {
@@ -104,8 +93,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		}
 	}
 
-	localClean.Release()
-	return containers, cleanupAll, nil
+	return containers, cu.Release(), nil
 }
 
 type execDesc struct {
@@ -141,7 +129,7 @@ func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
 // TestMultiContainerSanity checks that it is possible to run 2 dead-simple
 // containers in the same sandbox.
 func TestMultiContainerSanity(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -161,13 +149,13 @@ func TestMultiContainerSanity(t *testing.T) {
 
 			// Check via ps that multiple processes are running.
 			expectedPL := []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[0], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
 			}
 			expectedPL = []*control.Process{
-				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+				newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[1], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -207,13 +195,13 @@ func TestMultiPIDNS(t *testing.T) {
 
 			// Check via ps that multiple processes are running.
 			expectedPL := []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[0], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
 			}
 			expectedPL = []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[1], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -269,7 +257,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 
 			// Check via ps that multiple processes are running.
 			expectedPL := []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[0], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -279,7 +267,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 			}
 
 			expectedPL = []*control.Process{
-				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+				newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[1], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -312,7 +300,7 @@ func TestMultiContainerWait(t *testing.T) {
 
 	// Check via ps that multiple processes are running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+		newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -357,7 +345,7 @@ func TestMultiContainerWait(t *testing.T) {
 	// After Wait returns, ensure that the root container is running and
 	// the child has finished.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+		newProcessBuilder().Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -389,7 +377,7 @@ func TestExecWait(t *testing.T) {
 
 	// Check via ps that process is running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+		newProcessBuilder().Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Fatalf("failed to wait for sleep to start: %v", err)
@@ -424,7 +412,7 @@ func TestExecWait(t *testing.T) {
 
 	// Wait for the exec'd process to exit.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+		newProcessBuilder().PID(1).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Fatalf("failed to wait for second container to stop: %v", err)
@@ -510,9 +498,8 @@ func TestMultiContainerSignal(t *testing.T) {
 
 			// Check via ps that container 1 process is running.
 			expectedPL := []*control.Process{
-				{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+				newProcessBuilder().Cmd("sleep").Process(),
 			}
-
 			if err := waitForProcessList(containers[1], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
 			}
@@ -524,7 +511,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 			// Make sure process 1 is still running.
 			expectedPL = []*control.Process{
-				{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+				newProcessBuilder().PID(1).Cmd("sleep").Process(),
 			}
 			if err := waitForProcessList(containers[0], expectedPL); err != nil {
 				t.Errorf("failed to wait for sleep to start: %v", err)
@@ -638,8 +625,10 @@ func TestMultiContainerDestroy(t *testing.T) {
 			if err != nil {
 				t.Fatalf("error getting process data from sandbox: %v", err)
 			}
-			expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
-			if r, err := procListsEqual(pss, expectedPL); !r {
+			expectedPL := []*control.Process{
+				newProcessBuilder().PID(1).Cmd("sleep").Process(),
+			}
+			if !procListsEqual(pss, expectedPL) {
 				t.Errorf("container got process list: %s, want: %s: error: %v",
 					procListToString(pss), procListToString(expectedPL), err)
 			}
@@ -676,7 +665,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Check root's container process list doesn't include other containers.
 	expectedPL0 := []*control.Process{
-		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+		newProcessBuilder().PID(1).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[0], expectedPL0); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -684,8 +673,8 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Same for the other container.
 	expectedPL1 := []*control.Process{
-		{PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
-		{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
+		newProcessBuilder().PID(2).Cmd("sh").Process(),
+		newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -699,7 +688,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 	if _, err := containers[1].Execute(args); err != nil {
 		t.Fatalf("error exec'ing: %v", err)
 	}
-	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}})
+	expectedPL1 = append(expectedPL1, newProcessBuilder().PID(4).Cmd("sleep").Process())
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
 	}
@@ -1394,7 +1383,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
 		Destination: "/mydir/test",
 		Source:      "/some/dir",
 		Type:        "tmpfs",
-		Options:     []string{"rw", "relatime"},
+		Options:     []string{"rw", "rbind", "relatime"},
 	}
 	podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
 
@@ -1517,7 +1506,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Ensure container is running
 	c := containers[2]
 	expectedPL := []*control.Process{
-		{PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
+		newProcessBuilder().PID(3).Cmd("sleep").Process(),
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -1545,7 +1534,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 			continue // container[2] has been killed.
 		}
 		pl := []*control.Process{
-			{PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
+			newProcessBuilder().PID(kernel.ThreadID(i + 1)).Cmd("sleep").Process(),
 		}
 		if err := waitForProcessList(c, pl); err != nil {
 			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
@@ -1565,7 +1554,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
 	// and RPC errors out.
 	impossiblePL := []*control.Process{
-		{PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
+		newProcessBuilder().Cmd("non-existent-process").Process(),
 	}
 	if err := waitForProcessList(c, impossiblePL); err == nil {
 		t.Fatalf("Sandbox was not killed after gofer death")
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 64a406ae2..1036b0630 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -13,12 +13,12 @@ go_library(
     visibility = ["//runsc:__subpackages__"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/cleanup",
         "//pkg/fd",
         "//pkg/log",
         "//pkg/p9",
         "//pkg/sync",
         "//pkg/syserr",
-        "//runsc/specutils",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 1942f50d7..edc239013 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -33,11 +33,11 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 const (
@@ -439,7 +439,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
 	if err != nil {
 		return nil, nil, p9.QID{}, 0, extractErrno(err)
 	}
-	cu := specutils.MakeCleanup(func() {
+	cu := cleanup.Make(func() {
 		child.Close()
 		// Best effort attempt to remove the file in case of failure.
 		if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
@@ -480,7 +480,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
 	if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
-	cu := specutils.MakeCleanup(func() {
+	cu := cleanup.Make(func() {
 		// Best effort attempt to remove the dir in case of failure.
 		if err := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); err != nil {
 			log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err)
@@ -864,7 +864,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
 	if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil {
 		return p9.QID{}, extractErrno(err)
 	}
-	cu := specutils.MakeCleanup(func() {
+	cu := cleanup.Make(func() {
 		// Best effort attempt to remove the symlink in case of failure.
 		if err := syscall.Unlinkat(l.file.FD(), newName); err != nil {
 			log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err)
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index c95d50294..035dcd3e3 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -13,6 +13,7 @@ go_library(
         "//runsc:__subpackages__",
     ],
     deps = [
+        "//pkg/cleanup",
         "//pkg/control/client",
         "//pkg/control/server",
         "//pkg/log",
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index e4ec16e2f..6e1a2af25 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -30,6 +30,7 @@ import (
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/syndtr/gocapability/capability"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/control/client"
 	"gvisor.dev/gvisor/pkg/control/server"
 	"gvisor.dev/gvisor/pkg/log"
@@ -119,7 +120,7 @@ func New(conf *boot.Config, args *Args) (*Sandbox, error) {
 	s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
 	// The Cleanup object cleans up partially created sandboxes when an error
 	// occurs. Any errors occurring during cleanup itself are ignored.
-	c := specutils.MakeCleanup(func() {
+	c := cleanup.Make(func() {
 		err := s.destroy()
 		log.Warningf("error destroying sandbox: %v", err)
 	})
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index 60bb7b7ee..23001d67c 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"os/signal"
 	"path/filepath"
 	"runtime"
 	"syscall"
@@ -261,7 +262,18 @@ func MaybeRunAsRoot() error {
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
-	if err := cmd.Run(); err != nil {
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("re-executing self: %w", err)
+	}
+	ch := make(chan os.Signal, 1)
+	signal.Notify(ch)
+	go func() {
+		for {
+			// Forward all signals to child process.
+			cmd.Process.Signal(<-ch)
+		}
+	}()
+	if err := cmd.Wait(); err != nil {
 		if exit, ok := err.(*exec.ExitError); ok {
 			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
 				os.Exit(ws.ExitStatus())
@@ -269,7 +281,7 @@ func MaybeRunAsRoot() error {
 			log.Warningf("No wait status provided, exiting with -1: %v", err)
 			os.Exit(-1)
 		}
-		return fmt.Errorf("re-executing self: %v", err)
+		return err
 	}
 	// Child completed with success.
 	os.Exit(0)
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 202518b58..f1fa573c5 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -311,19 +311,7 @@ func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.
 
 // Is9PMount returns true if the given mount can be mounted as an external gofer.
 func Is9PMount(m specs.Mount) bool {
-	var isBind bool
-	switch m.Type {
-	case "bind":
-		isBind = true
-	default:
-		for _, opt := range m.Options {
-			if opt == "bind" || opt == "rbind" {
-				isBind = true
-				break
-			}
-		}
-	}
-	return isBind && m.Source != "" && IsSupportedDevMount(m)
+	return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m)
 }
 
 // IsSupportedDevMount returns true if the mount is a supported /dev mount.
@@ -456,36 +444,6 @@ func ContainsStr(strs []string, str string) bool {
 	return false
 }
 
-// Cleanup allows defers to be aborted when cleanup needs to happen
-// conditionally. Usage:
-// c := MakeCleanup(func() { f.Close() })
-// defer c.Clean() // any failure before release is called will close the file.
-// ...
-// c.Release() // on success, aborts closing the file and return it.
-// return f
-type Cleanup struct {
-	clean func()
-}
-
-// MakeCleanup creates a new Cleanup object.
-func MakeCleanup(f func()) Cleanup {
-	return Cleanup{clean: f}
-}
-
-// Clean calls the cleanup function.
-func (c *Cleanup) Clean() {
-	if c.clean != nil {
-		c.clean()
-		c.clean = nil
-	}
-}
-
-// Release releases the cleanup from its duties, i.e. cleanup function is not
-// called after this point.
-func (c *Cleanup) Release() {
-	c.clean = nil
-}
-
 // RetryEintr retries the function until an error different than EINTR is
 // returned.
 func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
diff --git a/scripts/build.sh b/scripts/build.sh
deleted file mode 100755
index e821e8624..000000000
--- a/scripts/build.sh
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-source $(dirname $0)/common.sh
-
-# Build runsc.
-runsc=$(build -c opt //runsc)
-
-# Build packages.
-pkgs=$(build -c opt //runsc:runsc-debian)
-
-# Stop here if we have no artifacts directory.
-[[ -v KOKORO_ARTIFACTS_DIR ]] || exit 0
-
-# install_raw installs raw artifacts.
-install_raw() {
-  mkdir -p "$1"
-  cp -f "${runsc}" "$1"/runsc
-  sha512sum "$1"/runsc | awk '{print $1 "  runsc"}' > "$1"/runsc.sha512
-}
-
-# Build a repository, if the key is available.
-#
-# Note that make_repository.sh script will install packages into the provided
-# root, but will output to stdout a directory that can be copied arbitrarily
-# into "${KOKORO_ARTIFACTS_DIR}"/dists/XXX. We do things this way because we
-# will copy the same repository structure into multiple locations, below.
-if [[ -v KOKORO_REPO_KEY ]]; then
-  repo=$(tools/make_repository.sh \
-          "${KOKORO_KEYSTORE_DIR}/${KOKORO_REPO_KEY}" \
-          gvisor-bot@google.com \
-          "${KOKORO_ARTIFACTS_DIR}" \
-          ${pkgs})
-fi
-
-# install_repo installs a repository.
-#
-# Note that packages are already installed, as noted above.
-install_repo() {
-  if [[ -v repo ]]; then
-    rm -rf "$1" && mkdir -p "$(dirname "$1")" && cp -a "${repo}" "$1"
-  fi
-}
-
-# If nightly, install only nightly artifacts.
-if [[ "${KOKORO_BUILD_NIGHTLY:-false}" == "true" ]]; then
-  # The "latest" directory and current date.
-  stamp="$(date -Idate)"
-  install_raw  "${KOKORO_ARTIFACTS_DIR}/nightly/latest"
-  install_raw  "${KOKORO_ARTIFACTS_DIR}/nightly/${stamp}"
-  install_repo "${KOKORO_ARTIFACTS_DIR}/dists/nightly"
-else
-  # Is it a tagged release? Build that.
-  tags="$(git tag --points-at HEAD)"
-  if ! [[ -z "${tags}" ]]; then
-    # Note that a given commit can match any number of tags. We have to iterate
-    # through all possible tags and produce associated artifacts.
-    for tag in ${tags}; do
-      name=$(echo "${tag}" | cut -d'-' -f2)
-      base=$(echo "${name}" | cut -d'.' -f1)
-      install_raw  "${KOKORO_ARTIFACTS_DIR}/release/${name}"
-      install_raw  "${KOKORO_ARTIFACTS_DIR}/release/latest"
-      install_repo "${KOKORO_ARTIFACTS_DIR}/dists/release"
-      install_repo "${KOKORO_ARTIFACTS_DIR}/dists/${base}"
-    done
-  else
-    # Otherwise, assume it is a raw master commit.
-    install_raw  "${KOKORO_ARTIFACTS_DIR}/master/latest"
-    install_repo "${KOKORO_ARTIFACTS_DIR}/dists/master"
-  fi
-fi
diff --git a/scripts/common_build.sh b/scripts/common_build.sh
index 4fe1067d2..0d9a191b5 100755
--- a/scripts/common_build.sh
+++ b/scripts/common_build.sh
@@ -63,6 +63,10 @@ function run_as_root() {
   bazel run --run_under="sudo" "${binary}" -- "$@"
 }
 
+function query() {
+  QUERY_RESULT=$(bazel query "$@")
+}
+
 function collect_logs() {
   # Zip out everything into a convenient form.
   if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then
diff --git a/scripts/packetdrill_tests.sh b/scripts/packetdrill_tests.sh
index f0fc444c8..727503bce 100755
--- a/scripts/packetdrill_tests.sh
+++ b/scripts/packetdrill_tests.sh
@@ -19,4 +19,5 @@ source $(dirname $0)/common.sh
 make load-packetdrill
 
 install_runsc_for_test runsc-d
-test_runsc $(bazel query "attr(tags, manual, tests(//test/packetdrill/...))")
+query "attr(tags, manual, tests(//test/packetdrill/...))"
+test_runsc $QUERY_RESULT
diff --git a/scripts/packetimpact_tests.sh b/scripts/packetimpact_tests.sh
index 17fc43f27..51c11f23f 100755
--- a/scripts/packetimpact_tests.sh
+++ b/scripts/packetimpact_tests.sh
@@ -19,4 +19,5 @@ source $(dirname $0)/common.sh
 make load-packetimpact
 
 install_runsc_for_test runsc-d
-test_runsc $(bazel query "attr(tags, packetimpact, tests(//test/packetimpact/...))")
+query "attr(tags, packetimpact, tests(//test/packetimpact/...))"
+test_runsc $QUERY_RESULT
diff --git a/scripts/release.sh b/scripts/release.sh
deleted file mode 100755
index ac7eff3ef..000000000
--- a/scripts/release.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-cd $(dirname $0)/..
-source scripts/common.sh
-
-# Tag a release only if provided.
-if ! [[ -v KOKORO_RELEASE_COMMIT ]]; then
-  echo "No KOKORO_RELEASE_COMMIT provided." >&2
-  exit 1
-fi
-if ! [[ -v KOKORO_RELEASE_TAG ]]; then
-  echo "No KOKORO_RELEASE_TAG provided." >&2
-  exit 1
-fi
-if ! [[ -v KOKORO_RELNOTES ]]; then
-  echo "No KOKORO_RELNOTES provided." >&2
-  exit 1
-fi
-if ! [[ -r "${KOKORO_ARTIFACTS_DIR}/${KOKORO_RELNOTES}" ]]; then
-  echo "The file '${KOKORO_ARTIFACTS_DIR}/${KOKORO_RELNOTES}' is not readable." >&2
-  exit 1
-fi
-
-# Unless an explicit releaser is provided, use the bot e-mail.
-declare -r KOKORO_RELEASE_AUTHOR=${KOKORO_RELEASE_AUTHOR:-gvisor-bot}
-declare -r EMAIL=${EMAIL:-${KOKORO_RELEASE_AUTHOR}@google.com}
-
-# Ensure we have an appropriate configuration for the tag.
-git config --get user.name || git config user.name "gVisor-bot"
-git config --get user.email || git config user.email "${EMAIL}"
-
-# Provide a credential if available.
-if [[ -v KOKORO_GITHUB_ACCESS_TOKEN ]]; then
-  git config --global credential.helper cache
-  git credential approve <<EOF
-protocol=https
-host=github.com
-username=$(cat "${KOKORO_KEYSTORE_DIR}/${KOKORO_GITHUB_ACCESS_TOKEN}")
-password=x-oauth-basic
-EOF
-fi
-
-# Run the release tool, which pushes to the origin repository.
-tools/tag_release.sh \
-    "${KOKORO_RELEASE_COMMIT}" \
-    "${KOKORO_RELEASE_TAG}" \
-    "${KOKORO_ARTIFACTS_DIR}/${KOKORO_RELNOTES}"
diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go
index ff856883a..91c956e10 100644
--- a/test/e2e/integration_test.go
+++ b/test/e2e/integration_test.go
@@ -239,7 +239,9 @@ func TestMemLimit(t *testing.T) {
 	d := dockerutil.MakeDocker(t)
 	defer d.CleanUp()
 
-	allocMemory := 500 * 1024
+	// N.B. Because the size of the memory file may grow in large chunks,
+	// there is a minimum threshold of 1GB for the MemTotal figure.
+	allocMemory := 1024 * 1024
 	out, err := d.Run(dockerutil.RunOpts{
 		Image:  "basic/alpine",
 		Memory: allocMemory, // In kB.
@@ -337,27 +339,53 @@ func TestJobControl(t *testing.T) {
 	}
 }
 
-// TestTmpFile checks that files inside '/tmp' are not overridden. In addition,
-// it checks that working dir is created if it doesn't exit.
+// TestWorkingDirCreation checks that working dir is created if it doesn't exit.
+func TestWorkingDirCreation(t *testing.T) {
+	for _, tc := range []struct {
+		name       string
+		workingDir string
+	}{
+		{name: "root", workingDir: "/foo"},
+		{name: "tmp", workingDir: "/tmp/foo"},
+	} {
+		for _, readonly := range []bool{true, false} {
+			name := tc.name
+			if readonly {
+				name += "-readonly"
+			}
+			t.Run(name, func(t *testing.T) {
+				d := dockerutil.MakeDocker(t)
+				defer d.CleanUp()
+
+				opts := dockerutil.RunOpts{
+					Image:    "basic/alpine",
+					WorkDir:  tc.workingDir,
+					ReadOnly: readonly,
+				}
+				got, err := d.Run(opts, "sh", "-c", "echo ${PWD}")
+				if err != nil {
+					t.Fatalf("docker run failed: %v", err)
+				}
+				if want := tc.workingDir + "\n"; want != got {
+					t.Errorf("invalid working dir, want: %q, got: %q", want, got)
+				}
+			})
+		}
+	}
+}
+
+// TestTmpFile checks that files inside '/tmp' are not overridden.
 func TestTmpFile(t *testing.T) {
 	d := dockerutil.MakeDocker(t)
 	defer d.CleanUp()
 
-	// Should work without ReadOnly
-	if _, err := d.Run(dockerutil.RunOpts{
-		Image:   "basic/alpine",
-		WorkDir: "/tmp/foo/bar",
-	}, "touch", "/tmp/foo/bar/file"); err != nil {
+	opts := dockerutil.RunOpts{Image: "tmpfile"}
+	got, err := d.Run(opts, "cat", "/tmp/foo/file.txt")
+	if err != nil {
 		t.Fatalf("docker run failed: %v", err)
 	}
-
-	// Expect failure.
-	if _, err := d.Run(dockerutil.RunOpts{
-		Image:    "basic/alpine",
-		WorkDir:  "/tmp/foo/bar",
-		ReadOnly: true,
-	}, "touch", "/tmp/foo/bar/file"); err == nil {
-		t.Fatalf("docker run expected failure, but succeeded")
+	if want := "123\n"; want != got {
+		t.Errorf("invalid file content, want: %q, got: %q", want, got)
 	}
 }
 
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
index 41e0cfa8d..872021358 100644
--- a/test/iptables/filter_input.go
+++ b/test/iptables/filter_input.go
@@ -49,6 +49,8 @@ func init() {
 	RegisterTestCase(FilterInputJumpTwice{})
 	RegisterTestCase(FilterInputDestination{})
 	RegisterTestCase(FilterInputInvertDestination{})
+	RegisterTestCase(FilterInputSource{})
+	RegisterTestCase(FilterInputInvertSource{})
 }
 
 // FilterInputDropUDP tests that we can drop UDP traffic.
@@ -667,3 +669,61 @@ func (FilterInputInvertDestination) ContainerAction(ip net.IP) error {
 func (FilterInputInvertDestination) LocalAction(ip net.IP) error {
 	return sendUDPLoop(ip, acceptPort, sendloopDuration)
 }
+
+// FilterInputSource verifies that we can filter packets via `-s
+// <ipaddr>`.
+type FilterInputSource struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputSource) Name() string {
+	return "FilterInputSource"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputSource) ContainerAction(ip net.IP) error {
+	// Make INPUT's default action DROP, then ACCEPT all packets from this
+	// machine.
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-A", "INPUT", "-s", fmt.Sprintf("%v", ip), "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputSource) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
+
+// FilterInputInvertSource verifies that we can filter packets via `! -s
+// <ipaddr>`.
+type FilterInputInvertSource struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputInvertSource) Name() string {
+	return "FilterInputInvertSource"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputInvertSource) ContainerAction(ip net.IP) error {
+	// Make INPUT's default action DROP, then ACCEPT all packets not bound
+	// for 127.0.0.1.
+	rules := [][]string{
+		{"-P", "INPUT", "DROP"},
+		{"-A", "INPUT", "!", "-s", localIP, "-j", "ACCEPT"},
+	}
+	if err := filterTableRules(rules); err != nil {
+		return err
+	}
+
+	return listenUDP(acceptPort, sendloopDuration)
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputInvertSource) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index 4fd2cb46a..38319a3b2 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -302,3 +302,15 @@ func TestNATPreRedirectInvert(t *testing.T) {
 func TestNATRedirectRequiresProtocol(t *testing.T) {
 	singleTest(t, NATRedirectRequiresProtocol{})
 }
+
+func TestNATLoopbackSkipsPrerouting(t *testing.T) {
+	singleTest(t, NATLoopbackSkipsPrerouting{})
+}
+
+func TestInputSource(t *testing.T) {
+	singleTest(t, FilterInputSource{})
+}
+
+func TestInputInvertSource(t *testing.T) {
+	singleTest(t, FilterInputInvertSource{})
+}
diff --git a/test/iptables/nat.go b/test/iptables/nat.go
index 0a10ce7fe..5e54a3963 100644
--- a/test/iptables/nat.go
+++ b/test/iptables/nat.go
@@ -39,6 +39,7 @@ func init() {
 	RegisterTestCase(NATOutDontRedirectIP{})
 	RegisterTestCase(NATOutRedirectInvert{})
 	RegisterTestCase(NATRedirectRequiresProtocol{})
+	RegisterTestCase(NATLoopbackSkipsPrerouting{})
 }
 
 // NATPreRedirectUDPPort tests that packets are redirected to different port.
@@ -326,32 +327,6 @@ func (NATRedirectRequiresProtocol) LocalAction(ip net.IP) error {
 	return nil
 }
 
-// loopbackTests runs an iptables rule and ensures that packets sent to
-// dest:dropPort are received by localhost:acceptPort.
-func loopbackTest(dest net.IP, args ...string) error {
-	if err := natTable(args...); err != nil {
-		return err
-	}
-	sendCh := make(chan error)
-	listenCh := make(chan error)
-	go func() {
-		sendCh <- sendUDPLoop(dest, dropPort, sendloopDuration)
-	}()
-	go func() {
-		listenCh <- listenUDP(acceptPort, sendloopDuration)
-	}()
-	select {
-	case err := <-listenCh:
-		if err != nil {
-			return err
-		}
-	case <-time.After(sendloopDuration):
-		return errors.New("timed out")
-	}
-	// sendCh will always take the full sendloop time.
-	return <-sendCh
-}
-
 // NATOutRedirectTCPPort tests that connections are redirected on specified ports.
 type NATOutRedirectTCPPort struct{}
 
@@ -400,3 +375,65 @@ func (NATOutRedirectTCPPort) ContainerAction(ip net.IP) error {
 func (NATOutRedirectTCPPort) LocalAction(ip net.IP) error {
 	return nil
 }
+
+// NATLoopbackSkipsPrerouting tests that packets sent via loopback aren't
+// affected by PREROUTING rules.
+type NATLoopbackSkipsPrerouting struct{}
+
+// Name implements TestCase.Name.
+func (NATLoopbackSkipsPrerouting) Name() string {
+	return "NATLoopbackSkipsPrerouting"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (NATLoopbackSkipsPrerouting) ContainerAction(ip net.IP) error {
+	// Redirect anything sent to localhost to an unused port.
+	dest := []byte{127, 0, 0, 1}
+	if err := natTable("-A", "PREROUTING", "-p", "tcp", "-j", "REDIRECT", "--to-port", fmt.Sprintf("%d", dropPort)); err != nil {
+		return err
+	}
+
+	// Establish a connection via localhost. If the PREROUTING rule did apply to
+	// loopback traffic, the connection would fail.
+	sendCh := make(chan error)
+	go func() {
+		sendCh <- connectTCP(dest, acceptPort, sendloopDuration)
+	}()
+
+	if err := listenTCP(acceptPort, sendloopDuration); err != nil {
+		return err
+	}
+	return <-sendCh
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (NATLoopbackSkipsPrerouting) LocalAction(ip net.IP) error {
+	// No-op.
+	return nil
+}
+
+// loopbackTests runs an iptables rule and ensures that packets sent to
+// dest:dropPort are received by localhost:acceptPort.
+func loopbackTest(dest net.IP, args ...string) error {
+	if err := natTable(args...); err != nil {
+		return err
+	}
+	sendCh := make(chan error)
+	listenCh := make(chan error)
+	go func() {
+		sendCh <- sendUDPLoop(dest, dropPort, sendloopDuration)
+	}()
+	go func() {
+		listenCh <- listenUDP(acceptPort, sendloopDuration)
+	}()
+	select {
+	case err := <-listenCh:
+		if err != nil {
+			return err
+		}
+	case <-time.After(sendloopDuration):
+		return errors.New("timed out")
+	}
+	// sendCh will always take the full sendloop time.
+	return <-sendCh
+}
diff --git a/test/packetimpact/README.md b/test/packetimpact/README.md
index a82ad996a..f46c67a0c 100644
--- a/test/packetimpact/README.md
+++ b/test/packetimpact/README.md
@@ -18,6 +18,27 @@ Packetimpact aims to provide:
 *   **Control-flow** like for loops, conditionals, and variables.
 *   **Flexibilty** to specify every byte in a packet or use multiple sockets.
 
+## How to run packetimpact tests?
+
+Build the test container image by running the following at the root of the
+repository:
+
+```bash
+$ make load-packetimpact
+```
+
+Run a test, e.g. `fin_wait2_timeout`, against Linux:
+
+```bash
+$ bazel test //test/packetimpact/tests:fin_wait2_timeout_linux_test
+```
+
+Run the same test, but against gVisor:
+
+```bash
+$ bazel test //test/packetimpact/tests:fin_wait2_timeout_netstack_test
+```
+
 ## When to use packetimpact?
 
 There are a few ways to write networking tests for gVisor currently:
diff --git a/test/packetimpact/dut/posix_server.cc b/test/packetimpact/dut/posix_server.cc
index dc3024f44..a1a5c3612 100644
--- a/test/packetimpact/dut/posix_server.cc
+++ b/test/packetimpact/dut/posix_server.cc
@@ -61,7 +61,7 @@
 }
 
 ::grpc::Status proto_to_sockaddr(const posix_server::Sockaddr &sockaddr_proto,
-                                 sockaddr_storage *addr) {
+                                 sockaddr_storage *addr, socklen_t *addr_len) {
   switch (sockaddr_proto.sockaddr_case()) {
     case posix_server::Sockaddr::SockaddrCase::kIn: {
       auto proto_in = sockaddr_proto.in();
@@ -74,6 +74,7 @@
       addr_in->sin_port = htons(proto_in.port());
       proto_in.addr().copy(reinterpret_cast<char *>(&addr_in->sin_addr.s_addr),
                            4);
+      *addr_len = sizeof(*addr_in);
       break;
     }
     case posix_server::Sockaddr::SockaddrCase::kIn6: {
@@ -89,6 +90,7 @@
       proto_in6.addr().copy(
           reinterpret_cast<char *>(&addr_in6->sin6_addr.s6_addr), 16);
       addr_in6->sin6_scope_id = htonl(proto_in6.scope_id());
+      *addr_len = sizeof(*addr_in6);
       break;
     }
     case posix_server::Sockaddr::SockaddrCase::SOCKADDR_NOT_SET:
@@ -120,13 +122,14 @@ class PosixImpl final : public posix_server::Posix::Service {
     }
 
     sockaddr_storage addr;
-    auto err = proto_to_sockaddr(request->addr(), &addr);
+    socklen_t addr_len;
+    auto err = proto_to_sockaddr(request->addr(), &addr, &addr_len);
     if (!err.ok()) {
       return err;
     }
 
-    response->set_ret(bind(request->sockfd(),
-                           reinterpret_cast<sockaddr *>(&addr), sizeof(addr)));
+    response->set_ret(
+        bind(request->sockfd(), reinterpret_cast<sockaddr *>(&addr), addr_len));
     response->set_errno_(errno);
     return ::grpc::Status::OK;
   }
@@ -147,13 +150,22 @@ class PosixImpl final : public posix_server::Posix::Service {
                             "Missing address");
     }
     sockaddr_storage addr;
-    auto err = proto_to_sockaddr(request->addr(), &addr);
+    socklen_t addr_len;
+    auto err = proto_to_sockaddr(request->addr(), &addr, &addr_len);
     if (!err.ok()) {
       return err;
     }
 
-    response->set_ret(connect(
-        request->sockfd(), reinterpret_cast<sockaddr *>(&addr), sizeof(addr)));
+    response->set_ret(connect(request->sockfd(),
+                              reinterpret_cast<sockaddr *>(&addr), addr_len));
+    response->set_errno_(errno);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Fcntl(grpc_impl::ServerContext *context,
+                       const ::posix_server::FcntlRequest *request,
+                       ::posix_server::FcntlResponse *response) override {
+    response->set_ret(::fcntl(request->fd(), request->cmd(), request->arg()));
     response->set_errno_(errno);
     return ::grpc::Status::OK;
   }
@@ -237,14 +249,15 @@ class PosixImpl final : public posix_server::Posix::Service {
                             "Missing address");
     }
     sockaddr_storage addr;
-    auto err = proto_to_sockaddr(request->dest_addr(), &addr);
+    socklen_t addr_len;
+    auto err = proto_to_sockaddr(request->dest_addr(), &addr, &addr_len);
     if (!err.ok()) {
       return err;
     }
 
-    response->set_ret(::sendto(
-        request->sockfd(), request->buf().data(), request->buf().size(),
-        request->flags(), reinterpret_cast<sockaddr *>(&addr), sizeof(addr)));
+    response->set_ret(::sendto(request->sockfd(), request->buf().data(),
+                               request->buf().size(), request->flags(),
+                               reinterpret_cast<sockaddr *>(&addr), addr_len));
     response->set_errno_(errno);
     return ::grpc::Status::OK;
   }
diff --git a/test/packetimpact/netdevs/BUILD b/test/packetimpact/netdevs/BUILD
new file mode 100644
index 000000000..422bb9b0c
--- /dev/null
+++ b/test/packetimpact/netdevs/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(
+    licenses = ["notice"],
+)
+
+go_library(
+    name = "netdevs",
+    srcs = ["netdevs.go"],
+    visibility = ["//test/packetimpact:__subpackages__"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+    ],
+)
diff --git a/test/packetimpact/netdevs/netdevs.go b/test/packetimpact/netdevs/netdevs.go
new file mode 100644
index 000000000..d2c9cfeaf
--- /dev/null
+++ b/test/packetimpact/netdevs/netdevs.go
@@ -0,0 +1,104 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package netdevs contains utilities for working with network devices.
+package netdevs
+
+import (
+	"fmt"
+	"net"
+	"regexp"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// A DeviceInfo represents a network device.
+type DeviceInfo struct {
+	MAC      net.HardwareAddr
+	IPv4Addr net.IP
+	IPv4Net  *net.IPNet
+	IPv6Addr net.IP
+	IPv6Net  *net.IPNet
+}
+
+var (
+	deviceLine = regexp.MustCompile(`^\s*\d+: (\w+)`)
+	linkLine   = regexp.MustCompile(`^\s*link/\w+ ([0-9a-fA-F:]+)`)
+	inetLine   = regexp.MustCompile(`^\s*inet ([0-9./]+)`)
+	inet6Line  = regexp.MustCompile(`^\s*inet6 ([0-9a-fA-Z:/]+)`)
+)
+
+// ParseDevices parses the output from `ip addr show` into a map from device
+// name to information about the device.
+func ParseDevices(cmdOutput string) (map[string]DeviceInfo, error) {
+	var currentDevice string
+	var currentInfo DeviceInfo
+	deviceInfos := make(map[string]DeviceInfo)
+	for _, line := range strings.Split(cmdOutput, "\n") {
+		if m := deviceLine.FindStringSubmatch(line); m != nil {
+			if currentDevice != "" {
+				deviceInfos[currentDevice] = currentInfo
+			}
+			currentInfo = DeviceInfo{}
+			currentDevice = m[1]
+		} else if m := linkLine.FindStringSubmatch(line); m != nil {
+			mac, err := net.ParseMAC(m[1])
+			if err != nil {
+				return nil, err
+			}
+			currentInfo.MAC = mac
+		} else if m := inetLine.FindStringSubmatch(line); m != nil {
+			ipv4Addr, ipv4Net, err := net.ParseCIDR(m[1])
+			if err != nil {
+				return nil, err
+			}
+			currentInfo.IPv4Addr = ipv4Addr
+			currentInfo.IPv4Net = ipv4Net
+		} else if m := inet6Line.FindStringSubmatch(line); m != nil {
+			ipv6Addr, ipv6Net, err := net.ParseCIDR(m[1])
+			if err != nil {
+				return nil, err
+			}
+			currentInfo.IPv6Addr = ipv6Addr
+			currentInfo.IPv6Net = ipv6Net
+		}
+	}
+	if currentDevice != "" {
+		deviceInfos[currentDevice] = currentInfo
+	}
+	return deviceInfos, nil
+}
+
+// MACToIP converts the MAC address to an IPv6 link local address as described
+// in RFC 4291 page 20: https://tools.ietf.org/html/rfc4291#page-20
+func MACToIP(mac net.HardwareAddr) net.IP {
+	addr := make([]byte, header.IPv6AddressSize)
+	addr[0] = 0xfe
+	addr[1] = 0x80
+	header.EthernetAdddressToModifiedEUI64IntoBuf(tcpip.LinkAddress(mac), addr[8:])
+	return net.IP(addr)
+}
+
+// FindDeviceByIP finds a DeviceInfo and device name from an IP address in the
+// output of ParseDevices.
+func FindDeviceByIP(ip net.IP, devices map[string]DeviceInfo) (string, DeviceInfo, error) {
+	for dev, info := range devices {
+		if info.IPv4Addr.Equal(ip) {
+			return dev, info, nil
+		}
+	}
+	return "", DeviceInfo{}, fmt.Errorf("can't find %s on any interface", ip)
+}
diff --git a/test/packetimpact/proto/posix_server.proto b/test/packetimpact/proto/posix_server.proto
index 9dca563f1..77da0fb3a 100644
--- a/test/packetimpact/proto/posix_server.proto
+++ b/test/packetimpact/proto/posix_server.proto
@@ -91,6 +91,17 @@ message ConnectResponse {
   int32 errno_ = 2;  // "errno" may fail to compile in c++.
 }
 
+message FcntlRequest {
+  int32 fd = 1;
+  int32 cmd = 2;
+  int32 arg = 3;
+}
+
+message FcntlResponse {
+  int32 ret = 1;
+  int32 errno_ = 2;
+}
+
 message GetSockNameRequest {
   int32 sockfd = 1;
 }
@@ -198,6 +209,8 @@ service Posix {
   rpc Close(CloseRequest) returns (CloseResponse);
   // Call connect() on the DUT.
   rpc Connect(ConnectRequest) returns (ConnectResponse);
+  // Call fcntl() on the DUT.
+  rpc Fcntl(FcntlRequest) returns (FcntlResponse);
   // Call getsockname() on the DUT.
   rpc GetSockName(GetSockNameRequest) returns (GetSockNameResponse);
   // Call getsockopt() on the DUT.
diff --git a/test/packetimpact/runner/BUILD b/test/packetimpact/runner/BUILD
new file mode 100644
index 000000000..0b68a760a
--- /dev/null
+++ b/test/packetimpact/runner/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_test")
+
+package(
+    default_visibility = ["//test/packetimpact:__subpackages__"],
+    licenses = ["notice"],
+)
+
+go_test(
+    name = "packetimpact_test",
+    srcs = ["packetimpact_test.go"],
+    tags = [
+        # Not intended to be run directly.
+        "local",
+        "manual",
+    ],
+    deps = [
+        "//pkg/test/dockerutil",
+        "//test/packetimpact/netdevs",
+    ],
+)
diff --git a/test/packetimpact/tests/defs.bzl b/test/packetimpact/runner/defs.bzl
index 45dce64ab..ea66b9756 100644
--- a/test/packetimpact/tests/defs.bzl
+++ b/test/packetimpact/runner/defs.bzl
@@ -11,12 +11,10 @@ def _packetimpact_test_impl(ctx):
         # permission problems, because all runfiles may not be owned by the
         # current user, and no other users will be mapped in that namespace.
         # Make sure that everything is readable here.
-        "find . -type f -exec chmod a+rx {} \\;",
-        "find . -type d -exec chmod a+rx {} \\;",
-        "%s %s --posix_server_binary %s --testbench_binary %s $@\n" % (
+        "find . -type f -or -type d -exec chmod a+rx {} \\;",
+        "%s %s --testbench_binary %s $@\n" % (
             test_runner.short_path,
             " ".join(ctx.attr.flags),
-            ctx.files._posix_server_binary[0].short_path,
             ctx.files.testbench_binary[0].short_path,
         ),
     ])
@@ -38,7 +36,7 @@ _packetimpact_test = rule(
         "_test_runner": attr.label(
             executable = True,
             cfg = "target",
-            default = ":test_runner",
+            default = ":packetimpact_test",
         ),
         "_posix_server_binary": attr.label(
             cfg = "target",
@@ -69,6 +67,7 @@ def packetimpact_linux_test(
     Args:
         name: name of the test
         testbench_binary: the testbench binary
+        expect_failure: the test must fail
         **kwargs: all the other args, forwarded to _packetimpact_test
     """
     expect_failure_flag = ["--expect_failure"] if expect_failure else []
@@ -113,8 +112,8 @@ def packetimpact_go_test(name, size = "small", pure = True, expect_linux_failure
         name: name of the test
         size: size of the test
         pure: make a static go binary
-        expect_linux_failure: expect the test to fail for Linux
-        expect_netstack_failure: expect the test to fail for Netstack
+        expect_linux_failure: the test must fail for Linux
+        expect_netstack_failure: the test must fail for Netstack
         **kwargs: all the other args, forwarded to go_test
     """
     testbench_binary = name + "_test"
diff --git a/test/packetimpact/runner/packetimpact_test.go b/test/packetimpact/runner/packetimpact_test.go
new file mode 100644
index 000000000..e58a1fb1b
--- /dev/null
+++ b/test/packetimpact/runner/packetimpact_test.go
@@ -0,0 +1,332 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The runner starts docker containers and networking for a packetimpact test.
+package packetimpact_test
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"math/rand"
+	"net"
+	"os"
+	"os/exec"
+	"path"
+	"strings"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/test/dockerutil"
+	"gvisor.dev/gvisor/test/packetimpact/netdevs"
+)
+
+// stringList implements flag.Value.
+type stringList []string
+
+// String implements flag.Value.String.
+func (l *stringList) String() string {
+	return strings.Join(*l, ",")
+}
+
+// Set implements flag.Value.Set.
+func (l *stringList) Set(value string) error {
+	*l = append(*l, value)
+	return nil
+}
+
+var (
+	dutPlatform     = flag.String("dut_platform", "", "either \"linux\" or \"netstack\"")
+	testbenchBinary = flag.String("testbench_binary", "", "path to the testbench binary")
+	tshark          = flag.Bool("tshark", false, "use more verbose tshark in logs instead of tcpdump")
+	extraTestArgs   = stringList{}
+	expectFailure   = flag.Bool("expect_failure", false, "expect that the test will fail when run")
+
+	dutAddr       = net.IPv4(0, 0, 0, 10)
+	testbenchAddr = net.IPv4(0, 0, 0, 20)
+)
+
+const ctrlPort = "40000"
+
+// logger implements testutil.Logger.
+//
+// Labels logs based on their source and formats multi-line logs.
+type logger string
+
+// Name implements testutil.Logger.Name.
+func (l logger) Name() string {
+	return string(l)
+}
+
+// Logf implements testutil.Logger.Logf.
+func (l logger) Logf(format string, args ...interface{}) {
+	lines := strings.Split(fmt.Sprintf(format, args...), "\n")
+	log.Printf("%s: %s", l, lines[0])
+	for _, line := range lines[1:] {
+		log.Printf("%*s  %s", len(l), "", line)
+	}
+}
+
+func TestOne(t *testing.T) {
+	flag.Var(&extraTestArgs, "extra_test_arg", "extra arguments to pass to the testbench")
+	flag.Parse()
+	if *dutPlatform != "linux" && *dutPlatform != "netstack" {
+		t.Fatal("--dut_platform should be either linux or netstack")
+	}
+	if *testbenchBinary == "" {
+		t.Fatal("--testbench_binary is missing")
+	}
+	if *dutPlatform == "netstack" {
+		if _, err := dockerutil.RuntimePath(); err != nil {
+			t.Fatal("--runtime is missing or invalid with --dut_platform=netstack:", err)
+		}
+	}
+	dockerutil.EnsureSupportedDockerVersion()
+
+	// Create the networks needed for the test. One control network is needed for
+	// the gRPC control packets and one test network on which to transmit the test
+	// packets.
+	ctrlNet := dockerutil.NewDockerNetwork(logger("ctrlNet"))
+	testNet := dockerutil.NewDockerNetwork(logger("testNet"))
+	for _, dn := range []*dockerutil.DockerNetwork{ctrlNet, testNet} {
+		for {
+			if err := createDockerNetwork(dn); err != nil {
+				t.Log("creating docker network:", err)
+				const wait = 100 * time.Millisecond
+				t.Logf("sleeping %s and will try creating docker network again", wait)
+				// This can fail if another docker network claimed the same IP so we'll
+				// just try again.
+				time.Sleep(wait)
+				continue
+			}
+			break
+		}
+		defer func(dn *dockerutil.DockerNetwork) {
+			if err := dn.Cleanup(); err != nil {
+				t.Errorf("unable to cleanup container %s: %s", dn.Name, err)
+			}
+		}(dn)
+	}
+
+	tmpDir, err := ioutil.TempDir("", "container-output")
+	if err != nil {
+		t.Fatal("creating temp dir:", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	const testOutputDir = "/tmp/testoutput"
+
+	runOpts := dockerutil.RunOpts{
+		Image:      "packetimpact",
+		CapAdd:     []string{"NET_ADMIN"},
+		Extra:      []string{"--sysctl", "net.ipv6.conf.all.disable_ipv6=0", "--rm", "-v", tmpDir + ":" + testOutputDir},
+		Foreground: true,
+	}
+
+	// Create the Docker container for the DUT.
+	dut := dockerutil.MakeDocker(logger("dut"))
+	if *dutPlatform == "linux" {
+		dut.Runtime = ""
+	}
+
+	const containerPosixServerBinary = "/packetimpact/posix_server"
+	dut.CopyFiles("/packetimpact", "/test/packetimpact/dut/posix_server")
+
+	if err := dut.Create(runOpts, containerPosixServerBinary, "--ip=0.0.0.0", "--port="+ctrlPort); err != nil {
+		t.Fatalf("unable to create container %s: %s", dut.Name, err)
+	}
+	defer dut.CleanUp()
+
+	// Add ctrlNet as eth1 and testNet as eth2.
+	const testNetDev = "eth2"
+	if err := addNetworks(dut, dutAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := dut.Start(); err != nil {
+		t.Fatalf("unable to start container %s: %s", dut.Name, err)
+	}
+
+	if _, err := dut.WaitForOutput("Server listening.*\n", 60*time.Second); err != nil {
+		t.Fatalf("%s on container %s never listened: %s", containerPosixServerBinary, dut.Name, err)
+	}
+
+	dutTestDevice, dutDeviceInfo, err := deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	remoteMAC := dutDeviceInfo.MAC
+	remoteIPv6 := dutDeviceInfo.IPv6Addr
+	// Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
+	// needed.
+	if remoteIPv6 == nil {
+		if _, err := dut.Exec(dockerutil.RunOpts{}, "ip", "addr", "add", netdevs.MACToIP(remoteMAC).String(), "scope", "link", "dev", dutTestDevice); err != nil {
+			t.Fatalf("unable to ip addr add on container %s: %s", dut.Name, err)
+		}
+		// Now try again, to make sure that it worked.
+		_, dutDeviceInfo, err = deviceByIP(dut, addressInSubnet(dutAddr, *testNet.Subnet))
+		if err != nil {
+			t.Fatal(err)
+		}
+		remoteIPv6 = dutDeviceInfo.IPv6Addr
+		if remoteIPv6 == nil {
+			t.Fatal("unable to set IPv6 address on container", dut.Name)
+		}
+	}
+
+	// Create the Docker container for the testbench.
+	testbench := dockerutil.MakeDocker(logger("testbench"))
+	testbench.Runtime = "" // The testbench always runs on Linux.
+
+	tbb := path.Base(*testbenchBinary)
+	containerTestbenchBinary := "/packetimpact/" + tbb
+	testbench.CopyFiles("/packetimpact", "/test/packetimpact/tests/"+tbb)
+
+	// Run tcpdump in the test bench unbuffered, without DNS resolution, just on
+	// the interface with the test packets.
+	snifferArgs := []string{
+		"tcpdump",
+		"-S", "-vvv", "-U", "-n",
+		"-i", testNetDev,
+		"-w", testOutputDir + "/dump.pcap",
+	}
+	snifferRegex := "tcpdump: listening.*\n"
+	if *tshark {
+		// Run tshark in the test bench unbuffered, without DNS resolution, just on
+		// the interface with the test packets.
+		snifferArgs = []string{
+			"tshark", "-V", "-l", "-n", "-i", testNetDev,
+			"-o", "tcp.check_checksum:TRUE",
+			"-o", "udp.check_checksum:TRUE",
+		}
+		snifferRegex = "Capturing on.*\n"
+	}
+
+	defer func() {
+		if err := exec.Command("/bin/cp", "-r", tmpDir, os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR")).Run(); err != nil {
+			t.Error("unable to copy container output files:", err)
+		}
+	}()
+
+	if err := testbench.Create(runOpts, snifferArgs...); err != nil {
+		t.Fatalf("unable to create container %s: %s", testbench.Name, err)
+	}
+	defer testbench.CleanUp()
+
+	// Add ctrlNet as eth1 and testNet as eth2.
+	if err := addNetworks(testbench, testbenchAddr, []*dockerutil.DockerNetwork{ctrlNet, testNet}); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := testbench.Start(); err != nil {
+		t.Fatalf("unable to start container %s: %s", testbench.Name, err)
+	}
+
+	// Kill so that it will flush output.
+	defer testbench.Exec(dockerutil.RunOpts{}, "killall", snifferArgs[0])
+
+	if _, err := testbench.WaitForOutput(snifferRegex, 60*time.Second); err != nil {
+		t.Fatalf("sniffer on %s never listened: %s", dut.Name, err)
+	}
+
+	// Because the Linux kernel receives the SYN-ACK but didn't send the SYN it
+	// will issue a RST. To prevent this IPtables can be used to filter out all
+	// incoming packets. The raw socket that packetimpact tests use will still see
+	// everything.
+	if _, err := testbench.Exec(dockerutil.RunOpts{}, "iptables", "-A", "INPUT", "-i", testNetDev, "-j", "DROP"); err != nil {
+		t.Fatalf("unable to Exec iptables on container %s: %s", testbench.Name, err)
+	}
+
+	// FIXME(b/156449515): Some piece of the system has a race. The old
+	// bash script version had a sleep, so we have one too. The race should
+	// be fixed and this sleep removed.
+	time.Sleep(time.Second)
+
+	// Start a packetimpact test on the test bench. The packetimpact test sends
+	// and receives packets and also sends POSIX socket commands to the
+	// posix_server to be executed on the DUT.
+	testArgs := []string{containerTestbenchBinary}
+	testArgs = append(testArgs, extraTestArgs...)
+	testArgs = append(testArgs,
+		"--posix_server_ip", addressInSubnet(dutAddr, *ctrlNet.Subnet).String(),
+		"--posix_server_port", ctrlPort,
+		"--remote_ipv4", addressInSubnet(dutAddr, *testNet.Subnet).String(),
+		"--local_ipv4", addressInSubnet(testbenchAddr, *testNet.Subnet).String(),
+		"--remote_ipv6", remoteIPv6.String(),
+		"--remote_mac", remoteMAC.String(),
+		"--device", testNetDev,
+	)
+	_, err = testbench.Exec(dockerutil.RunOpts{}, testArgs...)
+	if !*expectFailure && err != nil {
+		t.Fatal("test failed:", err)
+	}
+	if *expectFailure && err == nil {
+		t.Fatal("test failure expected but the test succeeded, enable the test and mark the corresponding bug as fixed")
+	}
+}
+
+func addNetworks(d *dockerutil.Docker, addr net.IP, networks []*dockerutil.DockerNetwork) error {
+	for _, dn := range networks {
+		ip := addressInSubnet(addr, *dn.Subnet)
+		// Connect to the network with the specified IP address.
+		if err := dn.Connect(d, "--ip", ip.String()); err != nil {
+			return fmt.Errorf("unable to connect container %s to network %s: %w", d.Name, dn.Name, err)
+		}
+	}
+	return nil
+}
+
+// addressInSubnet combines the subnet provided with the address and returns a
+// new address. The return address bits come from the subnet where the mask is 1
+// and from the ip address where the mask is 0.
+func addressInSubnet(addr net.IP, subnet net.IPNet) net.IP {
+	var octets []byte
+	for i := 0; i < 4; i++ {
+		octets = append(octets, (subnet.IP.To4()[i]&subnet.Mask[i])+(addr.To4()[i]&(^subnet.Mask[i])))
+	}
+	return net.IP(octets)
+}
+
+// makeDockerNetwork makes a randomly-named network that will start with the
+// namePrefix. The network will be a random /24 subnet.
+func createDockerNetwork(n *dockerutil.DockerNetwork) error {
+	randSource := rand.NewSource(time.Now().UnixNano())
+	r1 := rand.New(randSource)
+	// Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
+	ip := net.IPv4(byte(r1.Intn(224-192)+192), byte(r1.Intn(256)), byte(r1.Intn(256)), 0)
+	n.Subnet = &net.IPNet{
+		IP:   ip,
+		Mask: ip.DefaultMask(),
+	}
+	return n.Create()
+}
+
+// deviceByIP finds a deviceInfo and device name from an IP address.
+func deviceByIP(d *dockerutil.Docker, ip net.IP) (string, netdevs.DeviceInfo, error) {
+	out, err := d.Exec(dockerutil.RunOpts{}, "ip", "addr", "show")
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("listing devices on %s container: %w", d.Name, err)
+	}
+	devs, err := netdevs.ParseDevices(out)
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("parsing devices from %s container: %w", d.Name, err)
+	}
+	testDevice, deviceInfo, err := netdevs.FindDeviceByIP(ip, devs)
+	if err != nil {
+		return "", netdevs.DeviceInfo{}, fmt.Errorf("can't find deviceInfo for container %s: %w", d.Name, err)
+	}
+	return testDevice, deviceInfo, nil
+}
diff --git a/test/packetimpact/testbench/BUILD b/test/packetimpact/testbench/BUILD
index fed51006f..d19ec07d4 100644
--- a/test/packetimpact/testbench/BUILD
+++ b/test/packetimpact/testbench/BUILD
@@ -21,6 +21,7 @@ go_library(
         "//pkg/tcpip/header",
         "//pkg/tcpip/seqnum",
         "//pkg/usermem",
+        "//test/packetimpact/netdevs",
         "//test/packetimpact/proto:posix_server_go_proto",
         "@com_github_google_go-cmp//cmp:go_default_library",
         "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
@@ -39,6 +40,7 @@ go_test(
     library = ":testbench",
     deps = [
         "//pkg/tcpip",
+        "//pkg/tcpip/header",
         "@com_github_mohae_deepcopy//:go_default_library",
     ],
 )
diff --git a/test/packetimpact/testbench/connections.go b/test/packetimpact/testbench/connections.go
index 463fd0556..8b4a4d905 100644
--- a/test/packetimpact/testbench/connections.go
+++ b/test/packetimpact/testbench/connections.go
@@ -43,16 +43,17 @@ func portFromSockaddr(sa unix.Sockaddr) (uint16, error) {
 
 // pickPort makes a new socket and returns the socket FD and port. The domain should be AF_INET or AF_INET6. The caller must close the FD when done with
 // the port if there is no error.
-func pickPort(domain, typ int) (fd int, sa unix.Sockaddr, err error) {
-	fd, err = unix.Socket(domain, typ, 0)
+func pickPort(domain, typ int) (int, uint16, error) {
+	fd, err := unix.Socket(domain, typ, 0)
 	if err != nil {
-		return -1, nil, err
+		return -1, 0, err
 	}
 	defer func() {
 		if err != nil {
 			err = multierr.Append(err, unix.Close(fd))
 		}
 	}()
+	var sa unix.Sockaddr
 	switch domain {
 	case unix.AF_INET:
 		var sa4 unix.SockaddrInet4
@@ -63,16 +64,20 @@ func pickPort(domain, typ int) (fd int, sa unix.Sockaddr, err error) {
 		copy(sa6.Addr[:], net.ParseIP(LocalIPv6).To16())
 		sa = &sa6
 	default:
-		return -1, nil, fmt.Errorf("invalid domain %d, it should be one of unix.AF_INET or unix.AF_INET6", domain)
+		return -1, 0, fmt.Errorf("invalid domain %d, it should be one of unix.AF_INET or unix.AF_INET6", domain)
 	}
 	if err = unix.Bind(fd, sa); err != nil {
-		return -1, nil, err
+		return -1, 0, err
 	}
 	sa, err = unix.Getsockname(fd)
 	if err != nil {
-		return -1, nil, err
+		return -1, 0, err
 	}
-	return fd, sa, nil
+	port, err := portFromSockaddr(sa)
+	if err != nil {
+		return -1, 0, err
+	}
+	return fd, port, nil
 }
 
 // layerState stores the state of a layer of a connection.
@@ -114,12 +119,12 @@ var _ layerState = (*etherState)(nil)
 func newEtherState(out, in Ether) (*etherState, error) {
 	lMAC, err := tcpip.ParseMACAddress(LocalMAC)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("parsing local MAC: %q: %w", LocalMAC, err)
 	}
 
 	rMAC, err := tcpip.ParseMACAddress(RemoteMAC)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("parsing remote MAC: %q: %w", RemoteMAC, err)
 	}
 	s := etherState{
 		out: Ether{SrcAddr: &lMAC, DstAddr: &rMAC},
@@ -267,11 +272,7 @@ func SeqNumValue(v seqnum.Value) *seqnum.Value {
 
 // newTCPState creates a new TCPState.
 func newTCPState(domain int, out, in TCP) (*tcpState, error) {
-	portPickerFD, localAddr, err := pickPort(domain, unix.SOCK_STREAM)
-	if err != nil {
-		return nil, err
-	}
-	localPort, err := portFromSockaddr(localAddr)
+	portPickerFD, localPort, err := pickPort(domain, unix.SOCK_STREAM)
 	if err != nil {
 		return nil, err
 	}
@@ -374,14 +375,10 @@ type udpState struct {
 var _ layerState = (*udpState)(nil)
 
 // newUDPState creates a new udpState.
-func newUDPState(domain int, out, in UDP) (*udpState, unix.Sockaddr, error) {
-	portPickerFD, localAddr, err := pickPort(domain, unix.SOCK_DGRAM)
+func newUDPState(domain int, out, in UDP) (*udpState, error) {
+	portPickerFD, localPort, err := pickPort(domain, unix.SOCK_DGRAM)
 	if err != nil {
-		return nil, nil, err
-	}
-	localPort, err := portFromSockaddr(localAddr)
-	if err != nil {
-		return nil, nil, err
+		return nil, err
 	}
 	s := udpState{
 		out:          UDP{SrcPort: &localPort},
@@ -389,12 +386,12 @@ func newUDPState(domain int, out, in UDP) (*udpState, unix.Sockaddr, error) {
 		portPickerFD: portPickerFD,
 	}
 	if err := s.out.merge(&out); err != nil {
-		return nil, nil, err
+		return nil, err
 	}
 	if err := s.in.merge(&in); err != nil {
-		return nil, nil, err
+		return nil, err
 	}
-	return &s, localAddr, nil
+	return &s, nil
 }
 
 func (s *udpState) outgoing() Layer {
@@ -429,7 +426,6 @@ type Connection struct {
 	layerStates []layerState
 	injector    Injector
 	sniffer     Sniffer
-	localAddr   unix.Sockaddr
 	t           *testing.T
 }
 
@@ -475,20 +471,45 @@ func (conn *Connection) Close() {
 	}
 }
 
-// CreateFrame builds a frame for the connection with layer overriding defaults
-// of the innermost layer and additionalLayers added after it.
-func (conn *Connection) CreateFrame(layer Layer, additionalLayers ...Layer) Layers {
+// CreateFrame builds a frame for the connection with defaults overriden
+// from the innermost layer out, and additionalLayers added after it.
+//
+// Note that overrideLayers can have a length that is less than the number
+// of layers in this connection, and in such cases the innermost layers are
+// overriden first. As an example, valid values of overrideLayers for a TCP-
+// over-IPv4-over-Ethernet connection are: nil, [TCP], [IPv4, TCP], and
+// [Ethernet, IPv4, TCP].
+func (conn *Connection) CreateFrame(overrideLayers Layers, additionalLayers ...Layer) Layers {
 	var layersToSend Layers
-	for _, s := range conn.layerStates {
-		layersToSend = append(layersToSend, s.outgoing())
-	}
-	if err := layersToSend[len(layersToSend)-1].merge(layer); err != nil {
-		conn.t.Fatalf("can't merge %+v into %+v: %s", layer, layersToSend[len(layersToSend)-1], err)
+	for i, s := range conn.layerStates {
+		layer := s.outgoing()
+		// overrideLayers and conn.layerStates have their tails aligned, so
+		// to find the index we move backwards by the distance i is to the
+		// end.
+		if j := len(overrideLayers) - (len(conn.layerStates) - i); j >= 0 {
+			if err := layer.merge(overrideLayers[j]); err != nil {
+				conn.t.Fatalf("can't merge %+v into %+v: %s", layer, overrideLayers[j], err)
+			}
+		}
+		layersToSend = append(layersToSend, layer)
 	}
 	layersToSend = append(layersToSend, additionalLayers...)
 	return layersToSend
 }
 
+// SendFrameStateless sends a frame without updating any of the layer states.
+//
+// This method is useful for sending out-of-band control messages such as
+// ICMP packets, where it would not make sense to update the transport layer's
+// state using the ICMP header.
+func (conn *Connection) SendFrameStateless(frame Layers) {
+	outBytes, err := frame.ToBytes()
+	if err != nil {
+		conn.t.Fatalf("can't build outgoing packet: %s", err)
+	}
+	conn.injector.Send(outBytes)
+}
+
 // SendFrame sends a frame on the wire and updates the state of all layers.
 func (conn *Connection) SendFrame(frame Layers) {
 	outBytes, err := frame.ToBytes()
@@ -509,10 +530,13 @@ func (conn *Connection) SendFrame(frame Layers) {
 	}
 }
 
-// Send a packet with reasonable defaults. Potentially override the final layer
-// in the connection with the provided layer and add additionLayers.
-func (conn *Connection) Send(layer Layer, additionalLayers ...Layer) {
-	conn.SendFrame(conn.CreateFrame(layer, additionalLayers...))
+// send sends a packet, possibly with layers of this connection overridden and
+// additional layers added.
+//
+// Types defined with Connection as the underlying type should expose
+// type-safe versions of this method.
+func (conn *Connection) send(overrideLayers Layers, additionalLayers ...Layer) {
+	conn.SendFrame(conn.CreateFrame(overrideLayers, additionalLayers...))
 }
 
 // recvFrame gets the next successfully parsed frame (of type Layers) within the
@@ -627,15 +651,36 @@ func NewTCPIPv4(t *testing.T, outgoingTCP, incomingTCP TCP) TCPIPv4 {
 	}
 }
 
-// Handshake performs a TCP 3-way handshake. The input Connection should have a
+// Connect performs a TCP 3-way handshake. The input Connection should have a
 // final TCP Layer.
-func (conn *TCPIPv4) Handshake() {
+func (conn *TCPIPv4) Connect() {
+	conn.t.Helper()
+
 	// Send the SYN.
 	conn.Send(TCP{Flags: Uint8(header.TCPFlagSyn)})
 
 	// Wait for the SYN-ACK.
 	synAck, err := conn.Expect(TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second)
-	if synAck == nil {
+	if err != nil {
+		conn.t.Fatalf("didn't get synack during handshake: %s", err)
+	}
+	conn.layerStates[len(conn.layerStates)-1].(*tcpState).synAck = synAck
+
+	// Send an ACK.
+	conn.Send(TCP{Flags: Uint8(header.TCPFlagAck)})
+}
+
+// ConnectWithOptions performs a TCP 3-way handshake with given TCP options.
+// The input Connection should have a final TCP Layer.
+func (conn *TCPIPv4) ConnectWithOptions(options []byte) {
+	conn.t.Helper()
+
+	// Send the SYN.
+	conn.Send(TCP{Flags: Uint8(header.TCPFlagSyn), Options: options})
+
+	// Wait for the SYN-ACK.
+	synAck, err := conn.Expect(TCP{Flags: Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second)
+	if err != nil {
 		conn.t.Fatalf("didn't get synack during handshake: %s", err)
 	}
 	conn.layerStates[len(conn.layerStates)-1].(*tcpState).synAck = synAck
@@ -655,10 +700,35 @@ func (conn *TCPIPv4) ExpectData(tcp *TCP, payload *Payload, timeout time.Duratio
 	return (*Connection)(conn).ExpectFrame(expected, timeout)
 }
 
+// ExpectNextData attempts to receive the next incoming segment for the
+// connection and expects that to match the given layers.
+//
+// It differs from ExpectData() in that here we are only interested in the next
+// received segment, while ExpectData() can receive multiple segments for the
+// connection until there is a match with given layers or a timeout.
+func (conn *TCPIPv4) ExpectNextData(tcp *TCP, payload *Payload, timeout time.Duration) (Layers, error) {
+	// Receive the first incoming TCP segment for this connection.
+	got, err := conn.ExpectData(&TCP{}, nil, timeout)
+	if err != nil {
+		return nil, err
+	}
+
+	expected := make([]Layer, len(conn.layerStates))
+	expected[len(expected)-1] = tcp
+	if payload != nil {
+		expected = append(expected, payload)
+		tcp.SeqNum = Uint32(uint32(*conn.RemoteSeqNum()) - uint32(payload.Length()))
+	}
+	if !(*Connection)(conn).match(expected, got) {
+		return nil, fmt.Errorf("next frame is not matching %s during %s: got %s", expected, timeout, got)
+	}
+	return got, nil
+}
+
 // Send a packet with reasonable defaults. Potentially override the TCP layer in
 // the connection with the provided layer and add additionLayers.
 func (conn *TCPIPv4) Send(tcp TCP, additionalLayers ...Layer) {
-	(*Connection)(conn).Send(&tcp, additionalLayers...)
+	(*Connection)(conn).send(Layers{&tcp}, additionalLayers...)
 }
 
 // Close frees associated resources held by the TCPIPv4 connection.
@@ -680,27 +750,48 @@ func (conn *TCPIPv4) Expect(tcp TCP, timeout time.Duration) (*TCP, error) {
 	return gotTCP, err
 }
 
-func (conn *TCPIPv4) state() *tcpState {
-	state, ok := conn.layerStates[len(conn.layerStates)-1].(*tcpState)
+func (conn *TCPIPv4) tcpState() *tcpState {
+	state, ok := conn.layerStates[2].(*tcpState)
+	if !ok {
+		conn.t.Fatalf("got transport-layer state type=%T, expected tcpState", conn.layerStates[2])
+	}
+	return state
+}
+
+func (conn *TCPIPv4) ipv4State() *ipv4State {
+	state, ok := conn.layerStates[1].(*ipv4State)
 	if !ok {
-		conn.t.Fatalf("expected final state of %v to be tcpState", conn.layerStates)
+		conn.t.Fatalf("expected network-layer state type=%T, expected ipv4State", conn.layerStates[1])
 	}
 	return state
 }
 
 // RemoteSeqNum returns the next expected sequence number from the DUT.
 func (conn *TCPIPv4) RemoteSeqNum() *seqnum.Value {
-	return conn.state().remoteSeqNum
+	return conn.tcpState().remoteSeqNum
 }
 
 // LocalSeqNum returns the next sequence number to send from the testbench.
 func (conn *TCPIPv4) LocalSeqNum() *seqnum.Value {
-	return conn.state().localSeqNum
+	return conn.tcpState().localSeqNum
 }
 
 // SynAck returns the SynAck that was part of the handshake.
 func (conn *TCPIPv4) SynAck() *TCP {
-	return conn.state().synAck
+	return conn.tcpState().synAck
+}
+
+// LocalAddr gets the local socket address of this connection.
+func (conn *TCPIPv4) LocalAddr() *unix.SockaddrInet4 {
+	sa := &unix.SockaddrInet4{Port: int(*conn.tcpState().out.SrcPort)}
+	copy(sa.Addr[:], *conn.ipv4State().out.SrcAddr)
+	return sa
+}
+
+// Drain drains the sniffer's receive buffer by receiving packets until there's
+// nothing else to receive.
+func (conn *TCPIPv4) Drain() {
+	conn.sniffer.Drain()
 }
 
 // IPv6Conn maintains the state for all the layers in a IPv6 connection.
@@ -734,15 +825,10 @@ func NewIPv6Conn(t *testing.T, outgoingIPv6, incomingIPv6 IPv6) IPv6Conn {
 	}
 }
 
-// SendFrame sends a frame on the wire and updates the state of all layers.
-func (conn *IPv6Conn) SendFrame(frame Layers) {
-	(*Connection)(conn).SendFrame(frame)
-}
-
-// CreateFrame builds a frame for the connection with ipv6 overriding the ipv6
-// layer defaults and additionalLayers added after it.
-func (conn *IPv6Conn) CreateFrame(ipv6 IPv6, additionalLayers ...Layer) Layers {
-	return (*Connection)(conn).CreateFrame(&ipv6, additionalLayers...)
+// Send sends a frame with ipv6 overriding the IPv6 layer defaults and
+// additionalLayers added after it.
+func (conn *IPv6Conn) Send(ipv6 IPv6, additionalLayers ...Layer) {
+	(*Connection)(conn).send(Layers{&ipv6}, additionalLayers...)
 }
 
 // Close to clean up any resources held.
@@ -756,12 +842,6 @@ func (conn *IPv6Conn) ExpectFrame(frame Layers, timeout time.Duration) (Layers,
 	return (*Connection)(conn).ExpectFrame(frame, timeout)
 }
 
-// Drain drains the sniffer's receive buffer by receiving packets until there's
-// nothing else to receive.
-func (conn *TCPIPv4) Drain() {
-	conn.sniffer.Drain()
-}
-
 // UDPIPv4 maintains the state for all the layers in a UDP/IPv4 connection.
 type UDPIPv4 Connection
 
@@ -775,7 +855,7 @@ func NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 {
 	if err != nil {
 		t.Fatalf("can't make ipv4State: %s", err)
 	}
-	udpState, localAddr, err := newUDPState(unix.AF_INET, outgoingUDP, incomingUDP)
+	udpState, err := newUDPState(unix.AF_INET, outgoingUDP, incomingUDP)
 	if err != nil {
 		t.Fatalf("can't make udpState: %s", err)
 	}
@@ -792,42 +872,43 @@ func NewUDPIPv4(t *testing.T, outgoingUDP, incomingUDP UDP) UDPIPv4 {
 		layerStates: []layerState{etherState, ipv4State, udpState},
 		injector:    injector,
 		sniffer:     sniffer,
-		localAddr:   localAddr,
 		t:           t,
 	}
 }
 
-// LocalAddr gets the local socket address of this connection.
-func (conn *UDPIPv4) LocalAddr() unix.Sockaddr {
-	return conn.localAddr
+func (conn *UDPIPv4) udpState() *udpState {
+	state, ok := conn.layerStates[2].(*udpState)
+	if !ok {
+		conn.t.Fatalf("got transport-layer state type=%T, expected udpState", conn.layerStates[2])
+	}
+	return state
 }
 
-// CreateFrame builds a frame for the connection with layer overriding defaults
-// of the innermost layer and additionalLayers added after it.
-func (conn *UDPIPv4) CreateFrame(layer Layer, additionalLayers ...Layer) Layers {
-	return (*Connection)(conn).CreateFrame(layer, additionalLayers...)
+func (conn *UDPIPv4) ipv4State() *ipv4State {
+	state, ok := conn.layerStates[1].(*ipv4State)
+	if !ok {
+		conn.t.Fatalf("got network-layer state type=%T, expected ipv4State", conn.layerStates[1])
+	}
+	return state
 }
 
-// Send a packet with reasonable defaults. Potentially override the UDP layer in
-// the connection with the provided layer and add additionLayers.
-func (conn *UDPIPv4) Send(udp UDP, additionalLayers ...Layer) {
-	(*Connection)(conn).Send(&udp, additionalLayers...)
+// LocalAddr gets the local socket address of this connection.
+func (conn *UDPIPv4) LocalAddr() *unix.SockaddrInet4 {
+	sa := &unix.SockaddrInet4{Port: int(*conn.udpState().out.SrcPort)}
+	copy(sa.Addr[:], *conn.ipv4State().out.SrcAddr)
+	return sa
 }
 
-// SendFrame sends a frame on the wire and updates the state of all layers.
-func (conn *UDPIPv4) SendFrame(frame Layers) {
-	(*Connection)(conn).SendFrame(frame)
+// Send sends a packet with reasonable defaults, potentially overriding the UDP
+// layer and adding additionLayers.
+func (conn *UDPIPv4) Send(udp UDP, additionalLayers ...Layer) {
+	(*Connection)(conn).send(Layers{&udp}, additionalLayers...)
 }
 
-// SendIP sends a packet with additionalLayers following the IP layer in the
-// connection.
-func (conn *UDPIPv4) SendIP(additionalLayers ...Layer) {
-	var layersToSend Layers
-	for _, s := range conn.layerStates[:len(conn.layerStates)-1] {
-		layersToSend = append(layersToSend, s.outgoing())
-	}
-	layersToSend = append(layersToSend, additionalLayers...)
-	conn.SendFrame(layersToSend)
+// SendIP sends a packet with reasonable defaults, potentially overriding the
+// UDP and IPv4 headers and adding additionLayers.
+func (conn *UDPIPv4) SendIP(ip IPv4, udp UDP, additionalLayers ...Layer) {
+	(*Connection)(conn).send(Layers{&ip, &udp}, additionalLayers...)
 }
 
 // Expect expects a frame with the UDP layer matching the provided UDP within
diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go
index a78b7d7ee..2a2afecb5 100644
--- a/test/packetimpact/testbench/dut.go
+++ b/test/packetimpact/testbench/dut.go
@@ -16,6 +16,7 @@ package testbench
 
 import (
 	"context"
+	"flag"
 	"net"
 	"strconv"
 	"syscall"
@@ -37,6 +38,11 @@ type DUT struct {
 
 // NewDUT creates a new connection with the DUT over gRPC.
 func NewDUT(t *testing.T) DUT {
+	flag.Parse()
+	if err := genPseudoFlags(); err != nil {
+		t.Fatal("generating psuedo flags:", err)
+	}
+
 	posixServerAddress := POSIXServerIP + ":" + strconv.Itoa(POSIXServerPort)
 	conn, err := grpc.Dial(posixServerAddress, grpc.WithInsecure(), grpc.WithKeepaliveParams(keepalive.ClientParameters{Timeout: RPCKeepalive}))
 	if err != nil {
@@ -235,7 +241,9 @@ func (dut *DUT) Connect(fd int32, sa unix.Sockaddr) {
 	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
 	defer cancel()
 	ret, err := dut.ConnectWithErrno(ctx, fd, sa)
-	if ret != 0 {
+	// Ignore 'operation in progress' error that can be returned when the socket
+	// is non-blocking.
+	if err != syscall.Errno(unix.EINPROGRESS) && ret != 0 {
 		dut.t.Fatalf("failed to connect socket: %s", err)
 	}
 }
@@ -254,6 +262,35 @@ func (dut *DUT) ConnectWithErrno(ctx context.Context, fd int32, sa unix.Sockaddr
 	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
 }
 
+// Fcntl calls fcntl on the DUT and causes a fatal test failure if it
+// doesn't succeed. If more control over the timeout or error handling is
+// needed, use FcntlWithErrno.
+func (dut *DUT) Fcntl(fd, cmd, arg int32) int32 {
+	dut.t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), RPCTimeout)
+	defer cancel()
+	ret, err := dut.FcntlWithErrno(ctx, fd, cmd, arg)
+	if ret == -1 {
+		dut.t.Fatalf("failed to Fcntl: ret=%d, errno=%s", ret, err)
+	}
+	return ret
+}
+
+// FcntlWithErrno calls fcntl on the DUT.
+func (dut *DUT) FcntlWithErrno(ctx context.Context, fd, cmd, arg int32) (int32, error) {
+	dut.t.Helper()
+	req := pb.FcntlRequest{
+		Fd:  fd,
+		Cmd: cmd,
+		Arg: arg,
+	}
+	resp, err := dut.posixServer.Fcntl(ctx, &req)
+	if err != nil {
+		dut.t.Fatalf("failed to call Fcntl: %s", err)
+	}
+	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
+}
+
 // GetSockName calls getsockname on the DUT and causes a fatal test failure if
 // it doesn't succeed. If more control over the timeout or error handling is
 // needed, use GetSockNameWithErrno.
@@ -470,6 +507,19 @@ func (dut *DUT) SendToWithErrno(ctx context.Context, sockfd int32, buf []byte, f
 	return resp.GetRet(), syscall.Errno(resp.GetErrno_())
 }
 
+// SetNonBlocking will set O_NONBLOCK flag for fd if nonblocking
+// is true, otherwise it will clear the flag.
+func (dut *DUT) SetNonBlocking(fd int32, nonblocking bool) {
+	dut.t.Helper()
+	flags := dut.Fcntl(fd, unix.F_GETFL, 0)
+	if nonblocking {
+		flags |= unix.O_NONBLOCK
+	} else {
+		flags &= ^unix.O_NONBLOCK
+	}
+	dut.Fcntl(fd, unix.F_SETFL, flags)
+}
+
 func (dut *DUT) setSockOpt(ctx context.Context, sockfd, level, optname int32, optval *pb.SockOptVal) (int32, error) {
 	dut.t.Helper()
 	req := pb.SetSockOptRequest{
diff --git a/test/packetimpact/testbench/layers.go b/test/packetimpact/testbench/layers.go
index 49370377d..560c4111b 100644
--- a/test/packetimpact/testbench/layers.go
+++ b/test/packetimpact/testbench/layers.go
@@ -689,6 +689,7 @@ type TCP struct {
 	WindowSize    *uint16
 	Checksum      *uint16
 	UrgentPointer *uint16
+	Options       []byte
 }
 
 func (l *TCP) String() string {
@@ -697,7 +698,7 @@ func (l *TCP) String() string {
 
 // ToBytes implements Layer.ToBytes.
 func (l *TCP) ToBytes() ([]byte, error) {
-	b := make([]byte, header.TCPMinimumSize)
+	b := make([]byte, l.length())
 	h := header.TCP(b)
 	if l.SrcPort != nil {
 		h.SetSourcePort(*l.SrcPort)
@@ -727,6 +728,8 @@ func (l *TCP) ToBytes() ([]byte, error) {
 	if l.UrgentPointer != nil {
 		h.SetUrgentPoiner(*l.UrgentPointer)
 	}
+	copy(b[header.TCPMinimumSize:], l.Options)
+	header.AddTCPOptionPadding(b[header.TCPMinimumSize:], len(l.Options))
 	if l.Checksum != nil {
 		h.SetChecksum(*l.Checksum)
 		return h, nil
@@ -811,6 +814,7 @@ func parseTCP(b []byte) (Layer, layerParser) {
 		WindowSize:    Uint16(h.WindowSize()),
 		Checksum:      Uint16(h.Checksum()),
 		UrgentPointer: Uint16(h.UrgentPointer()),
+		Options:       b[header.TCPMinimumSize:h.DataOffset()],
 	}
 	return &tcp, parsePayload
 }
@@ -821,7 +825,12 @@ func (l *TCP) match(other Layer) bool {
 
 func (l *TCP) length() int {
 	if l.DataOffset == nil {
-		return header.TCPMinimumSize
+		// TCP header including the options must end on a 32-bit
+		// boundary; the user could potentially give us a slice
+		// whose length is not a multiple of 4 bytes, so we have
+		// to do the alignment here.
+		optlen := (len(l.Options) + 3) & ^3
+		return header.TCPMinimumSize + optlen
 	}
 	return int(*l.DataOffset)
 }
@@ -930,6 +939,11 @@ func (l *Payload) ToBytes() ([]byte, error) {
 	return l.Bytes, nil
 }
 
+// Length returns payload byte length.
+func (l *Payload) Length() int {
+	return l.length()
+}
+
 func (l *Payload) match(other Layer) bool {
 	return equalLayer(l, other)
 }
diff --git a/test/packetimpact/testbench/layers_test.go b/test/packetimpact/testbench/layers_test.go
index 96f72de5b..c7f00e70d 100644
--- a/test/packetimpact/testbench/layers_test.go
+++ b/test/packetimpact/testbench/layers_test.go
@@ -15,10 +15,13 @@
 package testbench
 
 import (
+	"bytes"
+	"net"
 	"testing"
 
 	"github.com/mohae/deepcopy"
 	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
 )
 
 func TestLayerMatch(t *testing.T) {
@@ -393,3 +396,112 @@ func TestLayersDiff(t *testing.T) {
 		}
 	}
 }
+
+func TestTCPOptions(t *testing.T) {
+	for _, tt := range []struct {
+		description string
+		wantBytes   []byte
+		wantLayers  Layers
+	}{
+		{
+			description: "without payload",
+			wantBytes: []byte{
+				// IPv4 Header
+				0x45, 0x00, 0x00, 0x2c, 0x00, 0x01, 0x00, 0x00, 0x40, 0x06,
+				0xf9, 0x77, 0xc0, 0xa8, 0x00, 0x02, 0xc0, 0xa8, 0x00, 0x01,
+				// TCP Header
+				0x30, 0x39, 0xd4, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x60, 0x02, 0x20, 0x00, 0xf5, 0x1c, 0x00, 0x00,
+				// WindowScale Option
+				0x03, 0x03, 0x02,
+				// NOP Option
+				0x00,
+			},
+			wantLayers: []Layer{
+				&IPv4{
+					IHL:            Uint8(20),
+					TOS:            Uint8(0),
+					TotalLength:    Uint16(44),
+					ID:             Uint16(1),
+					Flags:          Uint8(0),
+					FragmentOffset: Uint16(0),
+					TTL:            Uint8(64),
+					Protocol:       Uint8(uint8(header.TCPProtocolNumber)),
+					Checksum:       Uint16(0xf977),
+					SrcAddr:        Address(tcpip.Address(net.ParseIP("192.168.0.2").To4())),
+					DstAddr:        Address(tcpip.Address(net.ParseIP("192.168.0.1").To4())),
+				},
+				&TCP{
+					SrcPort:       Uint16(12345),
+					DstPort:       Uint16(54321),
+					SeqNum:        Uint32(0),
+					AckNum:        Uint32(0),
+					Flags:         Uint8(header.TCPFlagSyn),
+					WindowSize:    Uint16(8192),
+					Checksum:      Uint16(0xf51c),
+					UrgentPointer: Uint16(0),
+					Options:       []byte{3, 3, 2, 0},
+				},
+				&Payload{Bytes: nil},
+			},
+		},
+		{
+			description: "with payload",
+			wantBytes: []byte{
+				// IPv4 header
+				0x45, 0x00, 0x00, 0x37, 0x00, 0x01, 0x00, 0x00, 0x40, 0x06,
+				0xf9, 0x6c, 0xc0, 0xa8, 0x00, 0x02, 0xc0, 0xa8, 0x00, 0x01,
+				// TCP header
+				0x30, 0x39, 0xd4, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x60, 0x02, 0x20, 0x00, 0xe5, 0x21, 0x00, 0x00,
+				// WindowScale Option
+				0x03, 0x03, 0x02,
+				// NOP Option
+				0x00,
+				// Payload: "Sample Data"
+				0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x44, 0x61, 0x74, 0x61,
+			},
+			wantLayers: []Layer{
+				&IPv4{
+					IHL:            Uint8(20),
+					TOS:            Uint8(0),
+					TotalLength:    Uint16(55),
+					ID:             Uint16(1),
+					Flags:          Uint8(0),
+					FragmentOffset: Uint16(0),
+					TTL:            Uint8(64),
+					Protocol:       Uint8(uint8(header.TCPProtocolNumber)),
+					Checksum:       Uint16(0xf96c),
+					SrcAddr:        Address(tcpip.Address(net.ParseIP("192.168.0.2").To4())),
+					DstAddr:        Address(tcpip.Address(net.ParseIP("192.168.0.1").To4())),
+				},
+				&TCP{
+					SrcPort:       Uint16(12345),
+					DstPort:       Uint16(54321),
+					SeqNum:        Uint32(0),
+					AckNum:        Uint32(0),
+					Flags:         Uint8(header.TCPFlagSyn),
+					WindowSize:    Uint16(8192),
+					Checksum:      Uint16(0xe521),
+					UrgentPointer: Uint16(0),
+					Options:       []byte{3, 3, 2, 0},
+				},
+				&Payload{Bytes: []byte("Sample Data")},
+			},
+		},
+	} {
+		t.Run(tt.description, func(t *testing.T) {
+			layers := parse(parseIPv4, tt.wantBytes)
+			if !layers.match(tt.wantLayers) {
+				t.Fatalf("match failed with diff: %s", layers.diff(tt.wantLayers))
+			}
+			gotBytes, err := layers.ToBytes()
+			if err != nil {
+				t.Fatalf("ToBytes() failed on %s: %s", &layers, err)
+			}
+			if !bytes.Equal(tt.wantBytes, gotBytes) {
+				t.Fatalf("mismatching bytes, gotBytes: %x, wantBytes: %x", gotBytes, tt.wantBytes)
+			}
+		})
+	}
+}
diff --git a/test/packetimpact/testbench/rawsockets.go b/test/packetimpact/testbench/rawsockets.go
index 4665f60b2..278229b7e 100644
--- a/test/packetimpact/testbench/rawsockets.go
+++ b/test/packetimpact/testbench/rawsockets.go
@@ -16,7 +16,6 @@ package testbench
 
 import (
 	"encoding/binary"
-	"flag"
 	"fmt"
 	"math"
 	"net"
@@ -41,7 +40,6 @@ func htons(x uint16) uint16 {
 
 // NewSniffer creates a Sniffer connected to *device.
 func NewSniffer(t *testing.T) (Sniffer, error) {
-	flag.Parse()
 	snifferFd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL)))
 	if err != nil {
 		return Sniffer{}, err
@@ -136,7 +134,6 @@ type Injector struct {
 
 // NewInjector creates a new injector on *device.
 func NewInjector(t *testing.T) (Injector, error) {
-	flag.Parse()
 	ifInfo, err := net.InterfaceByName(Device)
 	if err != nil {
 		return Injector{}, err
diff --git a/test/packetimpact/testbench/testbench.go b/test/packetimpact/testbench/testbench.go
index a1242b189..4de2aa1d3 100644
--- a/test/packetimpact/testbench/testbench.go
+++ b/test/packetimpact/testbench/testbench.go
@@ -16,7 +16,12 @@ package testbench
 
 import (
 	"flag"
+	"fmt"
+	"net"
+	"os/exec"
 	"time"
+
+	"gvisor.dev/gvisor/test/packetimpact/netdevs"
 )
 
 var (
@@ -55,9 +60,31 @@ func RegisterFlags(fs *flag.FlagSet) {
 	fs.DurationVar(&RPCKeepalive, "rpc_keepalive", RPCKeepalive, "gRPC keepalive")
 	fs.StringVar(&LocalIPv4, "local_ipv4", LocalIPv4, "local IPv4 address for test packets")
 	fs.StringVar(&RemoteIPv4, "remote_ipv4", RemoteIPv4, "remote IPv4 address for test packets")
-	fs.StringVar(&LocalIPv6, "local_ipv6", LocalIPv6, "local IPv6 address for test packets")
 	fs.StringVar(&RemoteIPv6, "remote_ipv6", RemoteIPv6, "remote IPv6 address for test packets")
-	fs.StringVar(&LocalMAC, "local_mac", LocalMAC, "local mac address for test packets")
 	fs.StringVar(&RemoteMAC, "remote_mac", RemoteMAC, "remote mac address for test packets")
 	fs.StringVar(&Device, "device", Device, "local device for test packets")
 }
+
+// genPseudoFlags populates flag-like global config based on real flags.
+//
+// genPseudoFlags must only be called after flag.Parse.
+func genPseudoFlags() error {
+	out, err := exec.Command("ip", "addr", "show").CombinedOutput()
+	if err != nil {
+		return fmt.Errorf("listing devices: %q: %w", string(out), err)
+	}
+	devs, err := netdevs.ParseDevices(string(out))
+	if err != nil {
+		return fmt.Errorf("parsing devices: %w", err)
+	}
+
+	_, deviceInfo, err := netdevs.FindDeviceByIP(net.ParseIP(LocalIPv4), devs)
+	if err != nil {
+		return fmt.Errorf("can't find deviceInfo: %w", err)
+	}
+
+	LocalMAC = deviceInfo.MAC.String()
+	LocalIPv6 = deviceInfo.IPv6Addr.String()
+
+	return nil
+}
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index d4fcf31fa..2a41ef326 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -1,4 +1,4 @@
-load("defs.bzl", "packetimpact_go_test")
+load("//test/packetimpact/runner:defs.bzl", "packetimpact_go_test")
 
 package(
     default_visibility = ["//test/packetimpact:__subpackages__"],
@@ -16,6 +16,19 @@ packetimpact_go_test(
 )
 
 packetimpact_go_test(
+    name = "ipv4_id_uniqueness",
+    srcs = ["ipv4_id_uniqueness_test.go"],
+    # TODO(b/157506701) Fix netstack then remove the line below.
+    expect_netstack_failure = True,
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
     name = "udp_recv_multicast",
     srcs = ["udp_recv_multicast_test.go"],
     # TODO(b/152813495): Fix netstack then remove the line below.
@@ -31,8 +44,6 @@ packetimpact_go_test(
 packetimpact_go_test(
     name = "udp_icmp_error_propagation",
     srcs = ["udp_icmp_error_propagation_test.go"],
-    # TODO(b/153926291): Fix netstack then remove the line below.
-    expect_netstack_failure = True,
     deps = [
         "//pkg/tcpip",
         "//pkg/tcpip/header",
@@ -113,10 +124,8 @@ packetimpact_go_test(
 )
 
 packetimpact_go_test(
-    name = "tcp_should_piggyback",
-    srcs = ["tcp_should_piggyback_test.go"],
-    # TODO(b/153680566): Fix netstack then remove the line below.
-    expect_netstack_failure = True,
+    name = "tcp_send_window_sizes_piggyback",
+    srcs = ["tcp_send_window_sizes_piggyback_test.go"],
     deps = [
         "//pkg/tcpip/header",
         "//test/packetimpact/testbench",
@@ -136,6 +145,19 @@ packetimpact_go_test(
 )
 
 packetimpact_go_test(
+    name = "tcp_paws_mechanism",
+    srcs = ["tcp_paws_mechanism_test.go"],
+    # TODO(b/156682000): Fix netstack then remove the line below.
+    expect_netstack_failure = True,
+    deps = [
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
     name = "tcp_user_timeout",
     srcs = ["tcp_user_timeout_test.go"],
     deps = [
@@ -146,6 +168,58 @@ packetimpact_go_test(
 )
 
 packetimpact_go_test(
+    name = "tcp_queue_receive_in_syn_sent",
+    srcs = ["tcp_queue_receive_in_syn_sent_test.go"],
+    # TODO(b/157658105): Fix netstack then remove the line below.
+    expect_netstack_failure = True,
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_synsent_reset",
+    srcs = ["tcp_synsent_reset_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_synrcvd_reset",
+    srcs = ["tcp_synrcvd_reset_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_splitseg_mss",
+    srcs = ["tcp_splitseg_mss_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
+    name = "tcp_cork_mss",
+    srcs = ["tcp_cork_mss_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//test/packetimpact/testbench",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+packetimpact_go_test(
     name = "icmpv6_param_problem",
     srcs = ["icmpv6_param_problem_test.go"],
     # TODO(b/153485026): Fix netstack then remove the line below.
@@ -166,8 +240,3 @@ packetimpact_go_test(
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
-
-sh_binary(
-    name = "test_runner",
-    srcs = ["test_runner.sh"],
-)
diff --git a/test/packetimpact/tests/fin_wait2_timeout_test.go b/test/packetimpact/tests/fin_wait2_timeout_test.go
index c26ab78d9..407565078 100644
--- a/test/packetimpact/tests/fin_wait2_timeout_test.go
+++ b/test/packetimpact/tests/fin_wait2_timeout_test.go
@@ -21,11 +21,11 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 func TestFinWait2Timeout(t *testing.T) {
@@ -37,13 +37,13 @@ func TestFinWait2Timeout(t *testing.T) {
 		{"WithoutLinger2", false},
 	} {
 		t.Run(tt.description, func(t *testing.T) {
-			dut := tb.NewDUT(t)
+			dut := testbench.NewDUT(t)
 			defer dut.TearDown()
 			listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 			defer dut.Close(listenFd)
-			conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
 			defer conn.Close()
-			conn.Handshake()
+			conn.Connect()
 
 			acceptFd, _ := dut.Accept(listenFd)
 			if tt.linger2 {
@@ -52,21 +52,21 @@ func TestFinWait2Timeout(t *testing.T) {
 			}
 			dut.Close(acceptFd)
 
-			if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
+			if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagFin | header.TCPFlagAck)}, time.Second); err != nil {
 				t.Fatalf("expected a FIN-ACK within 1 second but got none: %s", err)
 			}
-			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
 
 			time.Sleep(5 * time.Second)
 			conn.Drain()
 
-			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
 			if tt.linger2 {
-				if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
+				if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
 					t.Fatalf("expected a RST packet within a second but got none: %s", err)
 				}
 			} else {
-				if got, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, 10*time.Second); got != nil || err == nil {
+				if got, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, 10*time.Second); got != nil || err == nil {
 					t.Fatalf("expected no RST packets within ten seconds but got one: %s", got)
 				}
 			}
diff --git a/test/packetimpact/tests/icmpv6_param_problem_test.go b/test/packetimpact/tests/icmpv6_param_problem_test.go
index bb1fc26fc..4d1d9a7f5 100644
--- a/test/packetimpact/tests/icmpv6_param_problem_test.go
+++ b/test/packetimpact/tests/icmpv6_param_problem_test.go
@@ -21,32 +21,32 @@ import (
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 // TestICMPv6ParamProblemTest sends a packet with a bad next header. The DUT
 // should respond with an ICMPv6 Parameter Problem message.
 func TestICMPv6ParamProblemTest(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
-	conn := tb.NewIPv6Conn(t, tb.IPv6{}, tb.IPv6{})
+	conn := testbench.NewIPv6Conn(t, testbench.IPv6{}, testbench.IPv6{})
 	defer conn.Close()
-	ipv6 := tb.IPv6{
+	ipv6 := testbench.IPv6{
 		// 254 is reserved and used for experimentation and testing. This should
 		// cause an error.
-		NextHeader: tb.Uint8(254),
+		NextHeader: testbench.Uint8(254),
 	}
-	icmpv6 := tb.ICMPv6{
-		Type:       tb.ICMPv6Type(header.ICMPv6EchoRequest),
+	icmpv6 := testbench.ICMPv6{
+		Type:       testbench.ICMPv6Type(header.ICMPv6EchoRequest),
 		NDPPayload: []byte("hello world"),
 	}
 
-	toSend := conn.CreateFrame(ipv6, &icmpv6)
-	conn.SendFrame(toSend)
+	toSend := (*testbench.Connection)(&conn).CreateFrame(testbench.Layers{&ipv6}, &icmpv6)
+	(*testbench.Connection)(&conn).SendFrame(toSend)
 
 	// Build the expected ICMPv6 payload, which includes an index to the
 	// problematic byte and also the problematic packet as described in
@@ -61,14 +61,14 @@ func TestICMPv6ParamProblemTest(t *testing.T) {
 	b := make([]byte, 4)
 	binary.BigEndian.PutUint32(b, header.IPv6NextHeaderOffset)
 	expectedPayload = append(b, expectedPayload...)
-	expectedICMPv6 := tb.ICMPv6{
-		Type:       tb.ICMPv6Type(header.ICMPv6ParamProblem),
+	expectedICMPv6 := testbench.ICMPv6{
+		Type:       testbench.ICMPv6Type(header.ICMPv6ParamProblem),
 		NDPPayload: expectedPayload,
 	}
 
-	paramProblem := tb.Layers{
-		&tb.Ether{},
-		&tb.IPv6{},
+	paramProblem := testbench.Layers{
+		&testbench.Ether{},
+		&testbench.IPv6{},
 		&expectedICMPv6,
 	}
 	timeout := time.Second
diff --git a/test/packetimpact/tests/ipv4_id_uniqueness_test.go b/test/packetimpact/tests/ipv4_id_uniqueness_test.go
new file mode 100644
index 000000000..4efb9829c
--- /dev/null
+++ b/test/packetimpact/tests/ipv4_id_uniqueness_test.go
@@ -0,0 +1,111 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv4_id_uniqueness_test
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func recvTCPSegment(conn *testbench.TCPIPv4, expect *testbench.TCP, expectPayload *testbench.Payload) (uint16, error) {
+	layers, err := conn.ExpectData(expect, expectPayload, time.Second)
+	if err != nil {
+		return 0, fmt.Errorf("failed to receive TCP segment: %s", err)
+	}
+	if len(layers) < 2 {
+		return 0, fmt.Errorf("got packet with layers: %v, expected to have at least 2 layers (link and network)", layers)
+	}
+	ipv4, ok := layers[1].(*testbench.IPv4)
+	if !ok {
+		return 0, fmt.Errorf("got network layer: %T, expected: *IPv4", layers[1])
+	}
+	if *ipv4.Flags&header.IPv4FlagDontFragment != 0 {
+		return 0, fmt.Errorf("got IPv4 DF=1, expected DF=0")
+	}
+	return *ipv4.ID, nil
+}
+
+// RFC 6864 section 4.2 states: "The IPv4 ID of non-atomic datagrams MUST NOT
+// be reused when sending a copy of an earlier non-atomic datagram."
+//
+// This test creates a TCP connection, uses the IP_MTU_DISCOVER socket option
+// to force the DF bit to be 0, and checks that a retransmitted segment has a
+// different IPv4 Identification value than the original segment.
+func TestIPv4RetransmitIdentificationUniqueness(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	conn.Connect()
+	remoteFD, _ := dut.Accept(listenFD)
+	defer dut.Close(remoteFD)
+
+	dut.SetSockOptInt(remoteFD, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+	// TODO(b/129291778) The following socket option clears the DF bit on
+	// IP packets sent over the socket, and is currently not supported by
+	// gVisor. gVisor by default sends packets with DF=0 anyway, so the
+	// socket option being not supported does not affect the operation of
+	// this test. Once the socket option is supported, the following call
+	// can be changed to simply assert success.
+	ret, errno := dut.SetSockOptIntWithErrno(context.Background(), remoteFD, unix.IPPROTO_IP, linux.IP_MTU_DISCOVER, linux.IP_PMTUDISC_DONT)
+	if ret == -1 && errno != unix.ENOTSUP {
+		t.Fatalf("failed to set IP_MTU_DISCOVER socket option to IP_PMTUDISC_DONT: %s", errno)
+	}
+
+	sampleData := []byte("Sample Data")
+	samplePayload := &testbench.Payload{Bytes: sampleData}
+
+	dut.Send(remoteFD, sampleData, 0)
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
+		t.Fatalf("failed to receive TCP segment sent for RTT calculation: %s", err)
+	}
+	// Let the DUT estimate RTO with RTT from the DATA-ACK.
+	// TODO(gvisor.dev/issue/2685) Estimate RTO during handshake, after which
+	// we can skip sending this ACK.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+
+	expectTCP := &testbench.TCP{SeqNum: testbench.Uint32(uint32(*conn.RemoteSeqNum()))}
+	dut.Send(remoteFD, sampleData, 0)
+	originalID, err := recvTCPSegment(&conn, expectTCP, samplePayload)
+	if err != nil {
+		t.Fatalf("failed to receive TCP segment: %s", err)
+	}
+
+	retransmitID, err := recvTCPSegment(&conn, expectTCP, samplePayload)
+	if err != nil {
+		t.Fatalf("failed to receive retransmitted TCP segment: %s", err)
+	}
+	if originalID == retransmitID {
+		t.Fatalf("unexpectedly got retransmitted TCP segment with same IPv4 ID field=%d", originalID)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_close_wait_ack_test.go b/test/packetimpact/tests/tcp_close_wait_ack_test.go
index 70a22a2db..6e7ff41d7 100644
--- a/test/packetimpact/tests/tcp_close_wait_ack_test.go
+++ b/test/packetimpact/tests/tcp_close_wait_ack_test.go
@@ -23,17 +23,17 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 func TestCloseWaitAck(t *testing.T) {
 	for _, tt := range []struct {
 		description    string
-		makeTestingTCP func(conn *tb.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) tb.TCP
+		makeTestingTCP func(conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP
 		seqNumOffset   seqnum.Size
 		expectAck      bool
 	}{
@@ -45,27 +45,27 @@ func TestCloseWaitAck(t *testing.T) {
 		{"ACK", GenerateUnaccACKSegment, 2, true},
 	} {
 		t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
-			dut := tb.NewDUT(t)
+			dut := testbench.NewDUT(t)
 			defer dut.TearDown()
 			listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 			defer dut.Close(listenFd)
-			conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
 			defer conn.Close()
 
-			conn.Handshake()
+			conn.Connect()
 			acceptFd, _ := dut.Accept(listenFd)
 
 			// Send a FIN to DUT to intiate the active close
-			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagFin)})
-			gotTCP, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, time.Second)
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)})
+			gotTCP, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
 			if err != nil {
 				t.Fatalf("expected an ACK for our fin and DUT should enter CLOSE_WAIT: %s", err)
 			}
 			windowSize := seqnum.Size(*gotTCP.WindowSize)
 
 			// Send a segment with OTW Seq / unacc ACK and expect an ACK back
-			conn.Send(tt.makeTestingTCP(&conn, tt.seqNumOffset, windowSize), &tb.Payload{Bytes: []byte("Sample Data")})
-			gotAck, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, time.Second)
+			conn.Send(tt.makeTestingTCP(&conn, tt.seqNumOffset, windowSize), &testbench.Payload{Bytes: []byte("Sample Data")})
+			gotAck, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
 			if tt.expectAck && err != nil {
 				t.Fatalf("expected an ack but got none: %s", err)
 			}
@@ -75,14 +75,14 @@ func TestCloseWaitAck(t *testing.T) {
 
 			// Now let's verify DUT is indeed in CLOSE_WAIT
 			dut.Close(acceptFd)
-			if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil {
+			if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagFin)}, time.Second); err != nil {
 				t.Fatalf("expected DUT to send a FIN: %s", err)
 			}
 			// Ack the FIN from DUT
-			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
 			// Send some extra data to DUT
-			conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, &tb.Payload{Bytes: []byte("Sample Data")})
-			if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: []byte("Sample Data")})
+			if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, time.Second); err != nil {
 				t.Fatalf("expected DUT to send an RST: %s", err)
 			}
 		})
@@ -92,17 +92,17 @@ func TestCloseWaitAck(t *testing.T) {
 // This generates an segment with seqnum = RCV.NXT + RCV.WND + seqNumOffset, the
 // generated segment is only acceptable when seqNumOffset is 0, otherwise an ACK
 // is expected from the receiver.
-func GenerateOTWSeqSegment(conn *tb.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) tb.TCP {
+func GenerateOTWSeqSegment(conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
 	lastAcceptable := conn.LocalSeqNum().Add(windowSize)
 	otwSeq := uint32(lastAcceptable.Add(seqNumOffset))
-	return tb.TCP{SeqNum: tb.Uint32(otwSeq), Flags: tb.Uint8(header.TCPFlagAck)}
+	return testbench.TCP{SeqNum: testbench.Uint32(otwSeq), Flags: testbench.Uint8(header.TCPFlagAck)}
 }
 
 // This generates an segment with acknum = SND.NXT + seqNumOffset, the generated
 // segment is only acceptable when seqNumOffset is 0, otherwise an ACK is
 // expected from the receiver.
-func GenerateUnaccACKSegment(conn *tb.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) tb.TCP {
+func GenerateUnaccACKSegment(conn *testbench.TCPIPv4, seqNumOffset seqnum.Size, windowSize seqnum.Size) testbench.TCP {
 	lastAcceptable := conn.RemoteSeqNum()
 	unaccAck := uint32(lastAcceptable.Add(seqNumOffset))
-	return tb.TCP{AckNum: tb.Uint32(unaccAck), Flags: tb.Uint8(header.TCPFlagAck)}
+	return testbench.TCP{AckNum: testbench.Uint32(unaccAck), Flags: testbench.Uint8(header.TCPFlagAck)}
 }
diff --git a/test/packetimpact/tests/tcp_cork_mss_test.go b/test/packetimpact/tests/tcp_cork_mss_test.go
new file mode 100644
index 000000000..fb8f48629
--- /dev/null
+++ b/test/packetimpact/tests/tcp_cork_mss_test.go
@@ -0,0 +1,84 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_cork_mss_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestTCPCorkMSS tests for segment coalesce and split as per MSS.
+func TestTCPCorkMSS(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	const mss = uint32(header.TCPDefaultMSS)
+	options := make([]byte, header.TCPOptionMSSLength)
+	header.EncodeMSSOption(mss, options)
+	conn.ConnectWithOptions(options)
+
+	acceptFD, _ := dut.Accept(listenFD)
+	defer dut.Close(acceptFD)
+
+	dut.SetSockOptInt(acceptFD, unix.IPPROTO_TCP, unix.TCP_CORK, 1)
+
+	// Let the dut application send 2 small segments to be held up and coalesced
+	// until the application sends a larger segment to fill up to > MSS.
+	sampleData := []byte("Sample Data")
+	dut.Send(acceptFD, sampleData, 0)
+	dut.Send(acceptFD, sampleData, 0)
+
+	expectedData := sampleData
+	expectedData = append(expectedData, sampleData...)
+	largeData := make([]byte, mss+1)
+	expectedData = append(expectedData, largeData...)
+	dut.Send(acceptFD, largeData, 0)
+
+	// Expect the segments to be coalesced and sent and capped to MSS.
+	expectedPayload := testbench.Payload{Bytes: expectedData[:mss]}
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &expectedPayload, time.Second); err != nil {
+		t.Fatalf("expected payload was not received: %s", err)
+	}
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	// Expect the coalesced segment to be split and transmitted.
+	expectedPayload = testbench.Payload{Bytes: expectedData[mss:]}
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &expectedPayload, time.Second); err != nil {
+		t.Fatalf("expected payload was not received: %s", err)
+	}
+
+	// Check for segments to *not* be held up because of TCP_CORK when
+	// the current send window is less than MSS.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(uint16(2 * len(sampleData)))})
+	dut.Send(acceptFD, sampleData, 0)
+	dut.Send(acceptFD, sampleData, 0)
+	expectedPayload = testbench.Payload{Bytes: append(sampleData, sampleData...)}
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &expectedPayload, time.Second); err != nil {
+		t.Fatalf("expected payload was not received: %s", err)
+	}
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+}
diff --git a/test/packetimpact/tests/tcp_noaccept_close_rst_test.go b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go
index 2c1ec27d3..b9b3e91d3 100644
--- a/test/packetimpact/tests/tcp_noaccept_close_rst_test.go
+++ b/test/packetimpact/tests/tcp_noaccept_close_rst_test.go
@@ -21,22 +21,22 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 func TestTcpNoAcceptCloseReset(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
 	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
-	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
-	conn.Handshake()
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	conn.Connect()
 	defer conn.Close()
 	dut.Close(listenFd)
-	if _, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, 1*time.Second); err != nil {
+	if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}, 1*time.Second); err != nil {
 		t.Fatalf("expected a RST-ACK packet but got none: %s", err)
 	}
 }
diff --git a/test/packetimpact/tests/tcp_outside_the_window_test.go b/test/packetimpact/tests/tcp_outside_the_window_test.go
index 351df193e..ad8c74234 100644
--- a/test/packetimpact/tests/tcp_outside_the_window_test.go
+++ b/test/packetimpact/tests/tcp_outside_the_window_test.go
@@ -23,11 +23,11 @@ import (
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 // TestTCPOutsideTheWindows tests the behavior of the DUT when packets arrive
@@ -38,7 +38,7 @@ func TestTCPOutsideTheWindow(t *testing.T) {
 	for _, tt := range []struct {
 		description  string
 		tcpFlags     uint8
-		payload      []tb.Layer
+		payload      []testbench.Layer
 		seqNumOffset seqnum.Size
 		expectACK    bool
 	}{
@@ -46,28 +46,28 @@ func TestTCPOutsideTheWindow(t *testing.T) {
 		{"SYNACK", header.TCPFlagSyn | header.TCPFlagAck, nil, 0, true},
 		{"ACK", header.TCPFlagAck, nil, 0, false},
 		{"FIN", header.TCPFlagFin, nil, 0, false},
-		{"Data", header.TCPFlagAck, []tb.Layer{&tb.Payload{Bytes: []byte("abc123")}}, 0, true},
+		{"Data", header.TCPFlagAck, []testbench.Layer{&testbench.Payload{Bytes: []byte("abc123")}}, 0, true},
 
 		{"SYN", header.TCPFlagSyn, nil, 1, true},
 		{"SYNACK", header.TCPFlagSyn | header.TCPFlagAck, nil, 1, true},
 		{"ACK", header.TCPFlagAck, nil, 1, true},
 		{"FIN", header.TCPFlagFin, nil, 1, false},
-		{"Data", header.TCPFlagAck, []tb.Layer{&tb.Payload{Bytes: []byte("abc123")}}, 1, true},
+		{"Data", header.TCPFlagAck, []testbench.Layer{&testbench.Payload{Bytes: []byte("abc123")}}, 1, true},
 
 		{"SYN", header.TCPFlagSyn, nil, 2, true},
 		{"SYNACK", header.TCPFlagSyn | header.TCPFlagAck, nil, 2, true},
 		{"ACK", header.TCPFlagAck, nil, 2, true},
 		{"FIN", header.TCPFlagFin, nil, 2, false},
-		{"Data", header.TCPFlagAck, []tb.Layer{&tb.Payload{Bytes: []byte("abc123")}}, 2, true},
+		{"Data", header.TCPFlagAck, []testbench.Layer{&testbench.Payload{Bytes: []byte("abc123")}}, 2, true},
 	} {
 		t.Run(fmt.Sprintf("%s%d", tt.description, tt.seqNumOffset), func(t *testing.T) {
-			dut := tb.NewDUT(t)
+			dut := testbench.NewDUT(t)
 			defer dut.TearDown()
 			listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 			defer dut.Close(listenFD)
-			conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
 			defer conn.Close()
-			conn.Handshake()
+			conn.Connect()
 			acceptFD, _ := dut.Accept(listenFD)
 			defer dut.Close(acceptFD)
 
@@ -75,13 +75,13 @@ func TestTCPOutsideTheWindow(t *testing.T) {
 			conn.Drain()
 			// Ignore whatever incrementing that this out-of-order packet might cause
 			// to the AckNum.
-			localSeqNum := tb.Uint32(uint32(*conn.LocalSeqNum()))
-			conn.Send(tb.TCP{
-				Flags:  tb.Uint8(tt.tcpFlags),
-				SeqNum: tb.Uint32(uint32(conn.LocalSeqNum().Add(windowSize))),
+			localSeqNum := testbench.Uint32(uint32(*conn.LocalSeqNum()))
+			conn.Send(testbench.TCP{
+				Flags:  testbench.Uint8(tt.tcpFlags),
+				SeqNum: testbench.Uint32(uint32(conn.LocalSeqNum().Add(windowSize))),
 			}, tt.payload...)
 			timeout := 3 * time.Second
-			gotACK, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: localSeqNum}, timeout)
+			gotACK, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: localSeqNum}, timeout)
 			if tt.expectACK && err != nil {
 				t.Fatalf("expected an ACK packet within %s but got none: %s", timeout, err)
 			}
diff --git a/test/packetimpact/tests/tcp_paws_mechanism_test.go b/test/packetimpact/tests/tcp_paws_mechanism_test.go
new file mode 100644
index 000000000..55db4ece6
--- /dev/null
+++ b/test/packetimpact/tests/tcp_paws_mechanism_test.go
@@ -0,0 +1,109 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_paws_mechanism_test
+
+import (
+	"encoding/hex"
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestPAWSMechanism(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	options := make([]byte, header.TCPOptionTSLength)
+	header.EncodeTSOption(currentTS(), 0, options)
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn), Options: options})
+	synAck, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, time.Second)
+	if err != nil {
+		t.Fatalf("didn't get synack during handshake: %s", err)
+	}
+	parsedSynOpts := header.ParseSynOptions(synAck.Options, true)
+	if !parsedSynOpts.TS {
+		t.Fatalf("expected TSOpt from DUT, options we got:\n%s", hex.Dump(synAck.Options))
+	}
+	tsecr := parsedSynOpts.TSVal
+	header.EncodeTSOption(currentTS(), tsecr, options)
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options})
+	acceptFD, _ := dut.Accept(listenFD)
+	defer dut.Close(acceptFD)
+
+	sampleData := []byte("Sample Data")
+	sentTSVal := currentTS()
+	header.EncodeTSOption(sentTSVal, tsecr, options)
+	// 3ms here is chosen arbitrarily to make sure we have increasing timestamps
+	// every time we send one, it should not cause any flakiness because timestamps
+	// only need to be non-decreasing.
+	time.Sleep(3 * time.Millisecond)
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}, &testbench.Payload{Bytes: sampleData})
+
+	gotTCP, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+	if err != nil {
+		t.Fatalf("expected an ACK but got none: %s", err)
+	}
+
+	parsedOpts := header.ParseTCPOptions(gotTCP.Options)
+	if !parsedOpts.TS {
+		t.Fatalf("expected TS option in response, options we got:\n%s", hex.Dump(gotTCP.Options))
+	}
+	if parsedOpts.TSVal < tsecr {
+		t.Fatalf("TSVal should be non-decreasing, but %d < %d", parsedOpts.TSVal, tsecr)
+	}
+	if parsedOpts.TSEcr != sentTSVal {
+		t.Fatalf("TSEcr should match our sent TSVal, %d != %d", parsedOpts.TSEcr, sentTSVal)
+	}
+	tsecr = parsedOpts.TSVal
+	lastAckNum := gotTCP.AckNum
+
+	badTSVal := sentTSVal - 100
+	header.EncodeTSOption(badTSVal, tsecr, options)
+	// 3ms here is chosen arbitrarily and this time.Sleep() should not cause flakiness
+	// due to the exact same reasoning discussed above.
+	time.Sleep(3 * time.Millisecond)
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), Options: options}, &testbench.Payload{Bytes: sampleData})
+
+	gotTCP, err = conn.Expect(testbench.TCP{AckNum: lastAckNum, Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second)
+	if err != nil {
+		t.Fatalf("expected segment with AckNum %d but got none: %s", lastAckNum, err)
+	}
+	parsedOpts = header.ParseTCPOptions(gotTCP.Options)
+	if !parsedOpts.TS {
+		t.Fatalf("expected TS option in response, options we got:\n%s", hex.Dump(gotTCP.Options))
+	}
+	if parsedOpts.TSVal < tsecr {
+		t.Fatalf("TSVal should be non-decreasing, but %d < %d", parsedOpts.TSVal, tsecr)
+	}
+	if parsedOpts.TSEcr != sentTSVal {
+		t.Fatalf("TSEcr should match our sent TSVal, %d != %d", parsedOpts.TSEcr, sentTSVal)
+	}
+}
+
+func currentTS() uint32 {
+	return uint32(time.Now().UnixNano() / 1e6)
+}
diff --git a/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go
new file mode 100644
index 000000000..b640d8673
--- /dev/null
+++ b/test/packetimpact/tests/tcp_queue_receive_in_syn_sent_test.go
@@ -0,0 +1,87 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_queue_receive_in_syn_sent_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/hex"
+	"errors"
+	"flag"
+	"net"
+	"sync"
+	"syscall"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+func TestQueueReceiveInSynSent(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+
+	socket, remotePort := dut.CreateBoundSocket(unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4))
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	sampleData := []byte("Sample Data")
+
+	dut.SetNonBlocking(socket, true)
+	if _, err := dut.ConnectWithErrno(context.Background(), socket, conn.LocalAddr()); !errors.Is(err, syscall.EINPROGRESS) {
+		t.Fatalf("failed to bring DUT to SYN-SENT, got: %s, want EINPROGRESS", err)
+	}
+	if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, time.Second); err != nil {
+		t.Fatalf("expected a SYN from DUT, but got none: %s", err)
+	}
+
+	// Issue RECEIVE call in SYN-SENT, this should be queued for process until the connection
+	// is established.
+	dut.SetNonBlocking(socket, false)
+	var wg sync.WaitGroup
+	defer wg.Wait()
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		ctx, cancel := context.WithTimeout(context.Background(), time.Second*3)
+		defer cancel()
+		n, buff, err := dut.RecvWithErrno(ctx, socket, int32(len(sampleData)), 0)
+		if n == -1 {
+			t.Fatalf("failed to recv on DUT: %s", err)
+		}
+		if got := buff[:n]; !bytes.Equal(got, sampleData) {
+			t.Fatalf("received data don't match, got:\n%s, want:\n%s", hex.Dump(got), hex.Dump(sampleData))
+		}
+	}()
+
+	// The following sleep is used to prevent the connection from being established while the
+	// RPC is in flight.
+	time.Sleep(time.Second)
+
+	// Bring the connection to Established.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)})
+	if _, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+		t.Fatalf("expected an ACK from DUT, but got none: %s", err)
+	}
+
+	// Send sample data to DUT.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: sampleData})
+}
diff --git a/test/packetimpact/tests/tcp_retransmits_test.go b/test/packetimpact/tests/tcp_retransmits_test.go
index c043ad881..e51409b66 100644
--- a/test/packetimpact/tests/tcp_retransmits_test.go
+++ b/test/packetimpact/tests/tcp_retransmits_test.go
@@ -21,53 +21,53 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 // TestRetransmits tests retransmits occur at exponentially increasing
 // time intervals.
 func TestRetransmits(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
 	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 	defer dut.Close(listenFd)
-	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
 	defer conn.Close()
 
-	conn.Handshake()
+	conn.Connect()
 	acceptFd, _ := dut.Accept(listenFd)
 	defer dut.Close(acceptFd)
 
 	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
 
 	sampleData := []byte("Sample Data")
-	samplePayload := &tb.Payload{Bytes: sampleData}
+	samplePayload := &testbench.Payload{Bytes: sampleData}
 
 	dut.Send(acceptFd, sampleData, 0)
-	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
 	// Give a chance for the dut to estimate RTO with RTT from the DATA-ACK.
 	// TODO(gvisor.dev/issue/2685) Estimate RTO during handshake, after which
 	// we can skip sending this ACK.
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
 
 	startRTO := time.Second
 	current := startRTO
 	first := time.Now()
 	dut.Send(acceptFd, sampleData, 0)
-	seq := tb.Uint32(uint32(*conn.RemoteSeqNum()))
-	if _, err := conn.ExpectData(&tb.TCP{SeqNum: seq}, samplePayload, startRTO); err != nil {
+	seq := testbench.Uint32(uint32(*conn.RemoteSeqNum()))
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: seq}, samplePayload, startRTO); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
 	// Expect retransmits of the same segment.
 	for i := 0; i < 5; i++ {
 		start := time.Now()
-		if _, err := conn.ExpectData(&tb.TCP{SeqNum: seq}, samplePayload, 2*current); err != nil {
+		if _, err := conn.ExpectData(&testbench.TCP{SeqNum: seq}, samplePayload, 2*current); err != nil {
 			t.Fatalf("expected a packet with payload %v: %s loop %d", samplePayload, err, i)
 		}
 		if i == 0 {
diff --git a/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go b/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go
new file mode 100644
index 000000000..90ab85419
--- /dev/null
+++ b/test/packetimpact/tests/tcp_send_window_sizes_piggyback_test.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_send_window_sizes_piggyback_test
+
+import (
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestSendWindowSizesPiggyback tests cases where segment sizes are close to
+// sender window size and checks for ACK piggybacking for each of those case.
+func TestSendWindowSizesPiggyback(t *testing.T) {
+	sampleData := []byte("Sample Data")
+	segmentSize := uint16(len(sampleData))
+	// Advertise receive window sizes that are lesser, equal to or greater than
+	// enqueued segment size and check for segment transmits. The test attempts
+	// to enqueue a segment on the dut before acknowledging previous segment and
+	// lets the dut piggyback any ACKs along with the enqueued segment.
+	for _, tt := range []struct {
+		description      string
+		windowSize       uint16
+		expectedPayload1 []byte
+		expectedPayload2 []byte
+		enqueue          bool
+	}{
+		// Expect the first segment to be split as it cannot be accomodated in
+		// the sender window. This means we need not enqueue a new segment after
+		// the first segment.
+		{"WindowSmallerThanSegment", segmentSize - 1, sampleData[:(segmentSize - 1)], sampleData[(segmentSize - 1):], false /* enqueue */},
+
+		{"WindowEqualToSegment", segmentSize, sampleData, sampleData, true /* enqueue */},
+
+		// Expect the second segment to not be split as its size is greater than
+		// the available sender window size. The segments should not be split
+		// when there is pending unacknowledged data and the segment-size is
+		// greater than available sender window.
+		{"WindowGreaterThanSegment", segmentSize + 1, sampleData, sampleData, true /* enqueue */},
+	} {
+		t.Run(fmt.Sprintf("%s%d", tt.description, tt.windowSize), func(t *testing.T) {
+			dut := testbench.NewDUT(t)
+			defer dut.TearDown()
+			listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+			defer dut.Close(listenFd)
+
+			conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort, WindowSize: testbench.Uint16(tt.windowSize)}, testbench.TCP{SrcPort: &remotePort})
+			defer conn.Close()
+
+			conn.Connect()
+			acceptFd, _ := dut.Accept(listenFd)
+			defer dut.Close(acceptFd)
+
+			dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
+
+			expectedTCP := testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}
+
+			dut.Send(acceptFd, sampleData, 0)
+			expectedPayload := testbench.Payload{Bytes: tt.expectedPayload1}
+			if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil {
+				t.Fatalf("expected payload was not received: %s", err)
+			}
+
+			// Expect any enqueued segment to be transmitted by the dut along with
+			// piggybacked ACK for our data.
+
+			if tt.enqueue {
+				// Enqueue a segment for the dut to transmit.
+				dut.Send(acceptFd, sampleData, 0)
+			}
+
+			// Send ACK for the previous segment along with data for the dut to
+			// receive and ACK back. Sending this ACK would make room for the dut
+			// to transmit any enqueued segment.
+			conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh), WindowSize: testbench.Uint16(tt.windowSize)}, &testbench.Payload{Bytes: sampleData})
+
+			// Expect the dut to piggyback the ACK for received data along with
+			// the segment enqueued for transmit.
+			expectedPayload = testbench.Payload{Bytes: tt.expectedPayload2}
+			if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil {
+				t.Fatalf("expected payload was not received: %s", err)
+			}
+		})
+	}
+}
diff --git a/test/packetimpact/tests/tcp_should_piggyback_test.go b/test/packetimpact/tests/tcp_should_piggyback_test.go
deleted file mode 100644
index 0240dc2f9..000000000
--- a/test/packetimpact/tests/tcp_should_piggyback_test.go
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package tcp_should_piggyback_test
-
-import (
-	"flag"
-	"testing"
-	"time"
-
-	"golang.org/x/sys/unix"
-	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
-)
-
-func init() {
-	tb.RegisterFlags(flag.CommandLine)
-}
-
-func TestPiggyback(t *testing.T) {
-	dut := tb.NewDUT(t)
-	defer dut.TearDown()
-	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
-	defer dut.Close(listenFd)
-	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort, WindowSize: tb.Uint16(12)}, tb.TCP{SrcPort: &remotePort})
-	defer conn.Close()
-
-	conn.Handshake()
-	acceptFd, _ := dut.Accept(listenFd)
-	defer dut.Close(acceptFd)
-
-	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
-
-	sampleData := []byte("Sample Data")
-
-	dut.Send(acceptFd, sampleData, 0)
-	expectedTCP := tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}
-	expectedPayload := tb.Payload{Bytes: sampleData}
-	if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil {
-		t.Fatalf("Expected %v but didn't get one: %s", tb.Layers{&expectedTCP, &expectedPayload}, err)
-	}
-
-	// Cause DUT to send us more data as soon as we ACK their first data segment because we have
-	// a small window.
-	dut.Send(acceptFd, sampleData, 0)
-
-	// DUT should ACK our segment by piggybacking ACK to their outstanding data segment instead of
-	// sending a separate ACK packet.
-	conn.Send(expectedTCP, &expectedPayload)
-	if _, err := conn.ExpectData(&expectedTCP, &expectedPayload, time.Second); err != nil {
-		t.Fatalf("Expected %v but didn't get one: %s", tb.Layers{&expectedTCP, &expectedPayload}, err)
-	}
-}
diff --git a/test/packetimpact/tests/tcp_splitseg_mss_test.go b/test/packetimpact/tests/tcp_splitseg_mss_test.go
new file mode 100644
index 000000000..9350d0988
--- /dev/null
+++ b/test/packetimpact/tests/tcp_splitseg_mss_test.go
@@ -0,0 +1,71 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_splitseg_mss_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestTCPSplitSegMSS lets the dut try to send segments larger than MSS.
+// It tests if the transmitted segments are capped at MSS and are split.
+func TestTCPSplitSegMSS(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	const mss = uint32(header.TCPDefaultMSS)
+	options := make([]byte, header.TCPOptionMSSLength)
+	header.EncodeMSSOption(mss, options)
+	conn.ConnectWithOptions(options)
+
+	acceptFD, _ := dut.Accept(listenFD)
+	defer dut.Close(acceptFD)
+
+	// Let the dut send a segment larger than MSS.
+	largeData := make([]byte, mss+1)
+	for i := 0; i < 2; i++ {
+		dut.Send(acceptFD, largeData, 0)
+		if i == 0 {
+			// On Linux, the initial segment goes out beyond MSS and the segment
+			// split occurs on retransmission. Call ExpectData to wait to
+			// receive the split segment.
+			if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: largeData[:mss]}, time.Second); err != nil {
+				t.Fatalf("expected payload was not received: %s", err)
+			}
+		} else {
+			if _, err := conn.ExpectNextData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, &testbench.Payload{Bytes: largeData[:mss]}, time.Second); err != nil {
+				t.Fatalf("expected payload was not received: %s", err)
+			}
+		}
+		conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+		if _, err := conn.ExpectNextData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &testbench.Payload{Bytes: largeData[mss:]}, time.Second); err != nil {
+			t.Fatalf("expected payload was not received: %s", err)
+		}
+		conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	}
+}
diff --git a/test/packetimpact/tests/tcp_synrcvd_reset_test.go b/test/packetimpact/tests/tcp_synrcvd_reset_test.go
new file mode 100644
index 000000000..7d5deab01
--- /dev/null
+++ b/test/packetimpact/tests/tcp_synrcvd_reset_test.go
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_syn_reset_test
+
+import (
+	"flag"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestTCPSynRcvdReset tests transition from SYN-RCVD to CLOSED.
+func TestTCPSynRcvdReset(t *testing.T) {
+	dut := testbench.NewDUT(t)
+	defer dut.TearDown()
+	listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
+	defer dut.Close(listenFD)
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+	defer conn.Close()
+
+	// Expect dut connection to have transitioned to SYN-RCVD state.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)})
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+		t.Fatalf("expected SYN-ACK %s", err)
+	}
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)})
+	// Expect the connection to have transitioned SYN-RCVD to CLOSED.
+	// TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side.
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a TCP RST %s", err)
+	}
+}
diff --git a/test/packetimpact/tests/tcp_synsent_reset_test.go b/test/packetimpact/tests/tcp_synsent_reset_test.go
new file mode 100644
index 000000000..6898a2239
--- /dev/null
+++ b/test/packetimpact/tests/tcp_synsent_reset_test.go
@@ -0,0 +1,88 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_synsent_reset_test
+
+import (
+	"flag"
+	"net"
+	"testing"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+	tb.RegisterFlags(flag.CommandLine)
+}
+
+// dutSynSentState sets up the dut connection in SYN-SENT state.
+func dutSynSentState(t *testing.T) (*tb.DUT, *tb.TCPIPv4, uint16, uint16) {
+	dut := tb.NewDUT(t)
+
+	clientFD, clientPort := dut.CreateBoundSocket(unix.SOCK_STREAM|unix.SOCK_NONBLOCK, unix.IPPROTO_TCP, net.ParseIP(tb.RemoteIPv4))
+	port := uint16(9001)
+	conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &port, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &port})
+
+	sa := unix.SockaddrInet4{Port: int(port)}
+	copy(sa.Addr[:], net.IP(net.ParseIP(tb.LocalIPv4)).To4())
+	// Bring the dut to SYN-SENT state with a non-blocking connect.
+	dut.Connect(clientFD, &sa)
+	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)}, nil, time.Second); err != nil {
+		t.Fatalf("expected SYN\n")
+	}
+
+	return &dut, &conn, port, clientPort
+}
+
+// TestTCPSynSentReset tests RFC793, p67: SYN-SENT to CLOSED transition.
+func TestTCPSynSentReset(t *testing.T) {
+	dut, conn, _, _ := dutSynSentState(t)
+	defer conn.Close()
+	defer dut.TearDown()
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst | header.TCPFlagAck)})
+	// Expect the connection to have closed.
+	// TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side.
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a TCP RST")
+	}
+}
+
+// TestTCPSynSentRcvdReset tests RFC793, p70, SYN-SENT to SYN-RCVD to CLOSED
+// transitions.
+func TestTCPSynSentRcvdReset(t *testing.T) {
+	dut, c, remotePort, clientPort := dutSynSentState(t)
+	defer dut.TearDown()
+	defer c.Close()
+
+	conn := tb.NewTCPIPv4(t, tb.TCP{SrcPort: &remotePort, DstPort: &clientPort}, tb.TCP{SrcPort: &clientPort, DstPort: &remotePort})
+	defer conn.Close()
+	// Initiate new SYN connection with the same port pair
+	// (simultaneous open case), expect the dut connection to move to
+	// SYN-RCVD state
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn)})
+	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}, nil, time.Second); err != nil {
+		t.Fatalf("expected SYN-ACK %s\n", err)
+	}
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)})
+	// Expect the connection to have transitioned SYN-RCVD to CLOSED.
+	// TODO(gvisor.dev/issue/478): Check for TCP_INFO on the dut side.
+	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+		t.Fatalf("expected a TCP RST")
+	}
+}
diff --git a/test/packetimpact/tests/tcp_user_timeout_test.go b/test/packetimpact/tests/tcp_user_timeout_test.go
index ce31917ee..87e45d765 100644
--- a/test/packetimpact/tests/tcp_user_timeout_test.go
+++ b/test/packetimpact/tests/tcp_user_timeout_test.go
@@ -22,27 +22,27 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
-func sendPayload(conn *tb.TCPIPv4, dut *tb.DUT, fd int32) error {
+func sendPayload(conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) error {
 	sampleData := make([]byte, 100)
 	for i := range sampleData {
 		sampleData[i] = uint8(i)
 	}
 	conn.Drain()
 	dut.Send(fd, sampleData, 0)
-	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &tb.Payload{Bytes: sampleData}, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, &testbench.Payload{Bytes: sampleData}, time.Second); err != nil {
 		return fmt.Errorf("expected data but got none: %w", err)
 	}
 	return nil
 }
 
-func sendFIN(conn *tb.TCPIPv4, dut *tb.DUT, fd int32) error {
+func sendFIN(conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) error {
 	dut.Close(fd)
 	return nil
 }
@@ -59,20 +59,20 @@ func TestTCPUserTimeout(t *testing.T) {
 	} {
 		for _, ttf := range []struct {
 			description string
-			f           func(conn *tb.TCPIPv4, dut *tb.DUT, fd int32) error
+			f           func(conn *testbench.TCPIPv4, dut *testbench.DUT, fd int32) error
 		}{
 			{"AfterPayload", sendPayload},
 			{"AfterFIN", sendFIN},
 		} {
 			t.Run(tt.description+ttf.description, func(t *testing.T) {
 				// Create a socket, listen, TCP handshake, and accept.
-				dut := tb.NewDUT(t)
+				dut := testbench.NewDUT(t)
 				defer dut.TearDown()
 				listenFD, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 				defer dut.Close(listenFD)
-				conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+				conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
 				defer conn.Close()
-				conn.Handshake()
+				conn.Connect()
 				acceptFD, _ := dut.Accept(listenFD)
 
 				if tt.userTimeout != 0 {
@@ -85,14 +85,14 @@ func TestTCPUserTimeout(t *testing.T) {
 
 				time.Sleep(tt.sendDelay)
 				conn.Drain()
-				conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+				conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
 
 				// If TCP_USER_TIMEOUT was set and the above delay was longer than the
 				// TCP_USER_TIMEOUT then the DUT should send a RST in response to the
 				// testbench's packet.
 				expectRST := tt.userTimeout != 0 && tt.sendDelay > tt.userTimeout
 				expectTimeout := 5 * time.Second
-				got, err := conn.Expect(tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, expectTimeout)
+				got, err := conn.Expect(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, expectTimeout)
 				if expectRST && err != nil {
 					t.Errorf("expected RST packet within %s but got none: %s", expectTimeout, err)
 				}
diff --git a/test/packetimpact/tests/tcp_window_shrink_test.go b/test/packetimpact/tests/tcp_window_shrink_test.go
index 58ec1d740..576ec1a8b 100644
--- a/test/packetimpact/tests/tcp_window_shrink_test.go
+++ b/test/packetimpact/tests/tcp_window_shrink_test.go
@@ -21,53 +21,53 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 func TestWindowShrink(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
 	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 	defer dut.Close(listenFd)
-	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
 	defer conn.Close()
 
-	conn.Handshake()
+	conn.Connect()
 	acceptFd, _ := dut.Accept(listenFd)
 	defer dut.Close(acceptFd)
 
 	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
 
 	sampleData := []byte("Sample Data")
-	samplePayload := &tb.Payload{Bytes: sampleData}
+	samplePayload := &testbench.Payload{Bytes: sampleData}
 
 	dut.Send(acceptFd, sampleData, 0)
-	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
 
 	dut.Send(acceptFd, sampleData, 0)
 	dut.Send(acceptFd, sampleData, 0)
-	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
-	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
 	// We close our receiving window here
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), WindowSize: tb.Uint16(0)})
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
 
 	dut.Send(acceptFd, []byte("Sample Data"), 0)
 	// Note: There is another kind of zero-window probing which Windows uses (by sending one
 	// new byte at `RemoteSeqNum`), if netstack wants to go that way, we may want to change
 	// the following lines.
 	expectedRemoteSeqNum := *conn.RemoteSeqNum() - 1
-	if _, err := conn.ExpectData(&tb.TCP{SeqNum: tb.Uint32(uint32(expectedRemoteSeqNum))}, nil, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: testbench.Uint32(uint32(expectedRemoteSeqNum))}, nil, time.Second); err != nil {
 		t.Fatalf("expected a packet with sequence number %v: %s", expectedRemoteSeqNum, err)
 	}
 }
diff --git a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go
index dd43a24db..54cee138f 100644
--- a/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go
+++ b/test/packetimpact/tests/tcp_zero_window_probe_retransmit_test.go
@@ -21,39 +21,39 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 // TestZeroWindowProbeRetransmit tests retransmits of zero window probes
 // to be sent at exponentially inreasing time intervals.
 func TestZeroWindowProbeRetransmit(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
 	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 	defer dut.Close(listenFd)
-	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
 	defer conn.Close()
 
-	conn.Handshake()
+	conn.Connect()
 	acceptFd, _ := dut.Accept(listenFd)
 	defer dut.Close(acceptFd)
 
 	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
 
 	sampleData := []byte("Sample Data")
-	samplePayload := &tb.Payload{Bytes: sampleData}
+	samplePayload := &testbench.Payload{Bytes: sampleData}
 
 	// Send and receive sample data to the dut.
 	dut.Send(acceptFd, sampleData, 0)
-	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
-	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil {
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil {
 		t.Fatalf("expected a packet with sequence number %s", err)
 	}
 
@@ -63,9 +63,9 @@ func TestZeroWindowProbeRetransmit(t *testing.T) {
 	// of the recorded first zero probe transmission duration.
 	//
 	// Advertize zero receive window again.
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), WindowSize: tb.Uint16(0)})
-	probeSeq := tb.Uint32(uint32(*conn.RemoteSeqNum() - 1))
-	ackProbe := tb.Uint32(uint32(*conn.RemoteSeqNum()))
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
+	probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum() - 1))
+	ackProbe := testbench.Uint32(uint32(*conn.RemoteSeqNum()))
 
 	startProbeDuration := time.Second
 	current := startProbeDuration
@@ -79,7 +79,7 @@ func TestZeroWindowProbeRetransmit(t *testing.T) {
 		// Expect zero-window probe with a timeout which is a function of the typical
 		// first retransmission time. The retransmission times is supposed to
 		// exponentially increase.
-		if _, err := conn.ExpectData(&tb.TCP{SeqNum: probeSeq}, nil, 2*current); err != nil {
+		if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, 2*current); err != nil {
 			t.Fatalf("expected a probe with sequence number %v: loop %d", probeSeq, i)
 		}
 		if i == 0 {
@@ -92,13 +92,14 @@ func TestZeroWindowProbeRetransmit(t *testing.T) {
 			t.Fatalf("zero probe came sooner interval %d probe %d\n", p, i)
 		}
 		// Acknowledge the zero-window probes from the dut.
-		conn.Send(tb.TCP{AckNum: ackProbe, Flags: tb.Uint8(header.TCPFlagAck), WindowSize: tb.Uint16(0)})
+		conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
 		current *= 2
 	}
 	// Advertize non-zero window.
-	conn.Send(tb.TCP{AckNum: ackProbe, Flags: tb.Uint8(header.TCPFlagAck)})
+	conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck)})
 	// Expect the dut to recover and transmit data.
-	if _, err := conn.ExpectData(&tb.TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.
+		TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
 }
diff --git a/test/packetimpact/tests/tcp_zero_window_probe_test.go b/test/packetimpact/tests/tcp_zero_window_probe_test.go
index 6c453505d..c9b3b7af2 100644
--- a/test/packetimpact/tests/tcp_zero_window_probe_test.go
+++ b/test/packetimpact/tests/tcp_zero_window_probe_test.go
@@ -21,41 +21,41 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 // TestZeroWindowProbe tests few cases of zero window probing over the
 // same connection.
 func TestZeroWindowProbe(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
 	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 	defer dut.Close(listenFd)
-	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
 	defer conn.Close()
 
-	conn.Handshake()
+	conn.Connect()
 	acceptFd, _ := dut.Accept(listenFd)
 	defer dut.Close(acceptFd)
 
 	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
 
 	sampleData := []byte("Sample Data")
-	samplePayload := &tb.Payload{Bytes: sampleData}
+	samplePayload := &testbench.Payload{Bytes: sampleData}
 
 	start := time.Now()
 	// Send and receive sample data to the dut.
 	dut.Send(acceptFd, sampleData, 0)
-	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
 	sendTime := time.Now().Sub(start)
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
-	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil {
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil {
 		t.Fatalf("expected a packet with sequence number %s", err)
 	}
 
@@ -63,16 +63,16 @@ func TestZeroWindowProbe(t *testing.T) {
 	//         probe to be sent.
 	//
 	// Advertize zero window to the dut.
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), WindowSize: tb.Uint16(0)})
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
 
 	// Expected sequence number of the zero window probe.
-	probeSeq := tb.Uint32(uint32(*conn.RemoteSeqNum() - 1))
+	probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum() - 1))
 	// Expected ack number of the ACK for the probe.
-	ackProbe := tb.Uint32(uint32(*conn.RemoteSeqNum()))
+	ackProbe := testbench.Uint32(uint32(*conn.RemoteSeqNum()))
 
 	// Expect there are no zero-window probes sent until there is data to be sent out
 	// from the dut.
-	if _, err := conn.ExpectData(&tb.TCP{SeqNum: probeSeq}, nil, 2*time.Second); err == nil {
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, 2*time.Second); err == nil {
 		t.Fatalf("unexpected a packet with sequence number %v: %s", probeSeq, err)
 	}
 
@@ -80,7 +80,7 @@ func TestZeroWindowProbe(t *testing.T) {
 	// Ask the dut to send out data.
 	dut.Send(acceptFd, sampleData, 0)
 	// Expect zero-window probe from the dut.
-	if _, err := conn.ExpectData(&tb.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil {
 		t.Fatalf("expected a packet with sequence number %v: %s", probeSeq, err)
 	}
 	// Expect the probe to be sent after some time. Compare against the previous
@@ -94,9 +94,9 @@ func TestZeroWindowProbe(t *testing.T) {
 	//         and sends out the sample payload after the send window opens.
 	//
 	// Advertize non-zero window to the dut and ack the zero window probe.
-	conn.Send(tb.TCP{AckNum: ackProbe, Flags: tb.Uint8(header.TCPFlagAck)})
+	conn.Send(testbench.TCP{AckNum: ackProbe, Flags: testbench.Uint8(header.TCPFlagAck)})
 	// Expect the dut to recover and transmit data.
-	if _, err := conn.ExpectData(&tb.TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: ackProbe}, samplePayload, time.Second); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
 
@@ -104,9 +104,9 @@ func TestZeroWindowProbe(t *testing.T) {
 	//         Check if the dut responds as we do for a similar probe sent to it.
 	//         Basically with sequence number to one byte behind the unacknowledged
 	//         sequence number.
-	p := tb.Uint32(uint32(*conn.LocalSeqNum()))
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), SeqNum: tb.Uint32(uint32(*conn.LocalSeqNum() - 1))})
-	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), AckNum: p}, nil, time.Second); err != nil {
+	p := testbench.Uint32(uint32(*conn.LocalSeqNum()))
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), SeqNum: testbench.Uint32(uint32(*conn.LocalSeqNum() - 1))})
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), AckNum: p}, nil, time.Second); err != nil {
 		t.Fatalf("expected a packet with ack number: %d: %s", p, err)
 	}
 }
diff --git a/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go b/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go
index 193427fb9..749281d9d 100644
--- a/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go
+++ b/test/packetimpact/tests/tcp_zero_window_probe_usertimeout_test.go
@@ -21,39 +21,39 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 // TestZeroWindowProbeUserTimeout sanity tests user timeout when we are
 // retransmitting zero window probes.
 func TestZeroWindowProbeUserTimeout(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
 	listenFd, remotePort := dut.CreateListener(unix.SOCK_STREAM, unix.IPPROTO_TCP, 1)
 	defer dut.Close(listenFd)
-	conn := tb.NewTCPIPv4(t, tb.TCP{DstPort: &remotePort}, tb.TCP{SrcPort: &remotePort})
+	conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
 	defer conn.Close()
 
-	conn.Handshake()
+	conn.Connect()
 	acceptFd, _ := dut.Accept(listenFd)
 	defer dut.Close(acceptFd)
 
 	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_NODELAY, 1)
 
 	sampleData := []byte("Sample Data")
-	samplePayload := &tb.Payload{Bytes: sampleData}
+	samplePayload := &testbench.Payload{Bytes: sampleData}
 
 	// Send and receive sample data to the dut.
 	dut.Send(acceptFd, sampleData, 0)
-	if _, err := conn.ExpectData(&tb.TCP{}, samplePayload, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{}, samplePayload, time.Second); err != nil {
 		t.Fatalf("expected a packet with payload %v: %s", samplePayload, err)
 	}
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
-	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil {
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck | header.TCPFlagPsh)}, samplePayload)
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, nil, time.Second); err != nil {
 		t.Fatalf("expected a packet with sequence number %s", err)
 	}
 
@@ -61,15 +61,15 @@ func TestZeroWindowProbeUserTimeout(t *testing.T) {
 	//         probe to be sent.
 	//
 	// Advertize zero window to the dut.
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), WindowSize: tb.Uint16(0)})
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
 
 	// Expected sequence number of the zero window probe.
-	probeSeq := tb.Uint32(uint32(*conn.RemoteSeqNum() - 1))
+	probeSeq := testbench.Uint32(uint32(*conn.RemoteSeqNum() - 1))
 	start := time.Now()
 	// Ask the dut to send out data.
 	dut.Send(acceptFd, sampleData, 0)
 	// Expect zero-window probe from the dut.
-	if _, err := conn.ExpectData(&tb.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil {
+	if _, err := conn.ExpectData(&testbench.TCP{SeqNum: probeSeq}, nil, time.Second); err != nil {
 		t.Fatalf("expected a packet with sequence number %v: %s", probeSeq, err)
 	}
 	// Record the duration for first probe, the dut sends the zero window probe after
@@ -82,7 +82,7 @@ func TestZeroWindowProbeUserTimeout(t *testing.T) {
 	// Reduce the retransmit timeout.
 	dut.SetSockOptInt(acceptFd, unix.IPPROTO_TCP, unix.TCP_USER_TIMEOUT, int32(startProbeDuration.Milliseconds()))
 	// Advertize zero window again.
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck), WindowSize: tb.Uint16(0)})
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck), WindowSize: testbench.Uint16(0)})
 	// Ask the dut to send out data that would trigger zero window probe retransmissions.
 	dut.Send(acceptFd, sampleData, 0)
 
@@ -91,8 +91,8 @@ func TestZeroWindowProbeUserTimeout(t *testing.T) {
 
 	// Expect the connection to have timed out and closed which would cause the dut
 	// to reply with a RST to the ACK we send.
-	conn.Send(tb.TCP{Flags: tb.Uint8(header.TCPFlagAck)})
-	if _, err := conn.ExpectData(&tb.TCP{Flags: tb.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
+	conn.Send(testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)})
+	if _, err := conn.ExpectData(&testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst)}, nil, time.Second); err != nil {
 		t.Fatalf("expected a TCP RST")
 	}
 }
diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh
deleted file mode 100755
index 706441cce..000000000
--- a/test/packetimpact/tests/test_runner.sh
+++ /dev/null
@@ -1,325 +0,0 @@
-#!/bin/bash
-
-# Copyright 2020 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Run a packetimpact test.  Two docker containers are made, one for the
-# Device-Under-Test (DUT) and one for the test bench.  Each is attached with
-# two networks, one for control packets that aid the test and one for test
-# packets which are sent as part of the test and observed for correctness.
-
-set -euxo pipefail
-
-function failure() {
-  local lineno=$1
-  local msg=$2
-  local filename="$0"
-  echo "FAIL: $filename:$lineno: $msg"
-}
-trap 'failure ${LINENO} "$BASH_COMMAND"' ERR
-
-declare -r LONGOPTS="dut_platform:,posix_server_binary:,testbench_binary:,runtime:,tshark,extra_test_arg:,expect_failure"
-
-# Don't use declare below so that the error from getopt will end the script.
-PARSED=$(getopt --options "" --longoptions=$LONGOPTS --name "$0" -- "$@")
-
-eval set -- "$PARSED"
-
-declare -a EXTRA_TEST_ARGS
-
-while true; do
-  case "$1" in
-    --dut_platform)
-      # Either "linux" or "netstack".
-      declare -r DUT_PLATFORM="$2"
-      shift 2
-      ;;
-    --posix_server_binary)
-      declare -r POSIX_SERVER_BINARY="$2"
-      shift 2
-      ;;
-    --testbench_binary)
-      declare -r TESTBENCH_BINARY="$2"
-      shift 2
-      ;;
-    --runtime)
-      # Not readonly because there might be multiple --runtime arguments and we
-      # want to use just the last one.  Only used if --dut_platform is
-      # "netstack".
-      declare RUNTIME="$2"
-      shift 2
-      ;;
-    --tshark)
-      declare -r TSHARK="1"
-      shift 1
-      ;;
-    --extra_test_arg)
-      EXTRA_TEST_ARGS+="$2"
-      shift 2
-      ;;
-    --expect_failure)
-      declare -r EXPECT_FAILURE="1"
-      shift 1
-      ;;
-    --)
-      shift
-      break
-      ;;
-    *)
-      echo "Programming error"
-      exit 3
-  esac
-done
-
-# All the other arguments are scripts.
-declare -r scripts="$@"
-
-# Check that the required flags are defined in a way that is safe for "set -u".
-if [[ "${DUT_PLATFORM-}" == "netstack" ]]; then
-  if [[ -z "${RUNTIME-}" ]]; then
-    echo "FAIL: Missing --runtime argument: ${RUNTIME-}"
-    exit 2
-  fi
-  declare -r RUNTIME_ARG="--runtime ${RUNTIME}"
-elif [[ "${DUT_PLATFORM-}" == "linux" ]]; then
-  declare -r RUNTIME_ARG=""
-else
-  echo "FAIL: Bad or missing --dut_platform argument: ${DUT_PLATFORM-}"
-  exit 2
-fi
-if [[ ! -f "${POSIX_SERVER_BINARY-}" ]]; then
-  echo "FAIL: Bad or missing --posix_server_binary: ${POSIX_SERVER-}"
-  exit 2
-fi
-if [[ ! -f "${TESTBENCH_BINARY-}" ]]; then
-  echo "FAIL: Bad or missing --testbench_binary: ${TESTBENCH_BINARY-}"
-  exit 2
-fi
-
-function new_net_prefix() {
-  # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24.
-  echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)"
-}
-
-# Variables specific to the control network and interface start with CTRL_.
-# Variables specific to the test network and interface start with TEST_.
-# Variables specific to the DUT start with DUT_.
-# Variables specific to the test bench start with TESTBENCH_.
-# Use random numbers so that test networks don't collide.
-declare CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
-declare CTRL_NET_PREFIX=$(new_net_prefix)
-declare TEST_NET="test_net-${RANDOM}${RANDOM}"
-declare TEST_NET_PREFIX=$(new_net_prefix)
-# On both DUT and test bench, testing packets are on the eth2 interface.
-declare -r TEST_DEVICE="eth2"
-# Number of bits in the *_NET_PREFIX variables.
-declare -r NET_MASK="24"
-# Last bits of the DUT's IP address.
-declare -r DUT_NET_SUFFIX=".10"
-# Control port.
-declare -r CTRL_PORT="40000"
-# Last bits of the test bench's IP address.
-declare -r TESTBENCH_NET_SUFFIX=".20"
-declare -r TIMEOUT="60"
-declare -r IMAGE_TAG="gcr.io/gvisor-presubmit/packetimpact"
-
-# Make sure that docker is installed.
-docker --version
-
-function finish {
-  local cleanup_success=1
-
-  if [[ -z "${TSHARK-}" ]]; then
-    # Kill tcpdump so that it will flush output.
-    docker exec -t "${TESTBENCH}" \
-      killall tcpdump || \
-      cleanup_success=0
-  else
-    # Kill tshark so that it will flush output.
-    docker exec -t "${TESTBENCH}" \
-      killall tshark || \
-      cleanup_success=0
-  fi
-
-  for net in "${CTRL_NET}" "${TEST_NET}"; do
-    # Kill all processes attached to ${net}.
-    for docker_command in "kill" "rm"; do
-      (docker network inspect "${net}" \
-        --format '{{range $key, $value := .Containers}}{{$key}} {{end}}' \
-        | xargs -r docker "${docker_command}") || \
-        cleanup_success=0
-    done
-    # Remove the network.
-    docker network rm "${net}" || \
-      cleanup_success=0
-  done
-
-  if ((!$cleanup_success)); then
-    echo "FAIL: Cleanup command failed"
-    exit 4
-  fi
-}
-trap finish EXIT
-
-# Subnet for control packets between test bench and DUT.
-while ! docker network create \
-  "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do
-  sleep 0.1
-  CTRL_NET_PREFIX=$(new_net_prefix)
-  CTRL_NET="ctrl_net-${RANDOM}${RANDOM}"
-done
-
-# Subnet for the packets that are part of the test.
-while ! docker network create \
-  "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do
-  sleep 0.1
-  TEST_NET_PREFIX=$(new_net_prefix)
-  TEST_NET="test_net-${RANDOM}${RANDOM}"
-done
-
-docker pull "${IMAGE_TAG}"
-
-# Create the DUT container and connect to network.
-DUT=$(docker create ${RUNTIME_ARG} --privileged --rm \
-  --cap-add NET_ADMIN \
-  --sysctl net.ipv6.conf.all.disable_ipv6=0 \
-  --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
-docker network connect "${CTRL_NET}" \
-  --ip "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
-  || (docker kill ${DUT}; docker rm ${DUT}; false)
-docker network connect "${TEST_NET}" \
-  --ip "${TEST_NET_PREFIX}${DUT_NET_SUFFIX}" "${DUT}" \
-  || (docker kill ${DUT}; docker rm ${DUT}; false)
-docker start "${DUT}"
-
-# Create the test bench container and connect to network.
-TESTBENCH=$(docker create --privileged --rm \
-  --cap-add NET_ADMIN \
-  --sysctl net.ipv6.conf.all.disable_ipv6=0 \
-  --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG})
-docker network connect "${CTRL_NET}" \
-  --ip "${CTRL_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" "${TESTBENCH}" \
-  || (docker kill ${TESTBENCH}; docker rm ${TESTBENCH}; false)
-docker network connect "${TEST_NET}" \
-  --ip "${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX}" "${TESTBENCH}" \
-  || (docker kill ${TESTBENCH}; docker rm ${TESTBENCH}; false)
-docker start "${TESTBENCH}"
-
-# Start the posix_server in the DUT.
-declare -r DOCKER_POSIX_SERVER_BINARY="/$(basename ${POSIX_SERVER_BINARY})"
-docker cp -L ${POSIX_SERVER_BINARY} "${DUT}:${DOCKER_POSIX_SERVER_BINARY}"
-
-docker exec -t "${DUT}" \
-  /bin/bash -c "${DOCKER_POSIX_SERVER_BINARY} \
-  --ip ${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \
-  --port ${CTRL_PORT}" &
-
-# Because the Linux kernel receives the SYN-ACK but didn't send the SYN it will
-# issue a RST. To prevent this IPtables can be used to filter those out.
-docker exec "${TESTBENCH}" \
-  iptables -A INPUT -i ${TEST_DEVICE} -j DROP
-
-# Wait for the DUT server to come up.  Attempt to connect to it from the test
-# bench every 100 milliseconds until success.
-while ! docker exec "${TESTBENCH}" \
-  nc -zv "${CTRL_NET_PREFIX}${DUT_NET_SUFFIX}" "${CTRL_PORT}"; do
-  sleep 0.1
-done
-
-declare -r REMOTE_MAC=$(docker exec -t "${DUT}" ip link show \
-  "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6)
-declare -r LOCAL_MAC=$(docker exec -t "${TESTBENCH}" ip link show \
-  "${TEST_DEVICE}" | tail -1 | cut -d' ' -f6)
-declare REMOTE_IPV6=$(docker exec -t "${DUT}" ip addr show scope link \
-  "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1)
-declare -r LOCAL_IPV6=$(docker exec -t "${TESTBENCH}" ip addr show scope link \
-  "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1)
-
-# Netstack as DUT doesn't assign IPv6 addresses automatically so do it if
-# needed.  Convert the MAC address to an IPv6 link local address as described in
-# RFC 4291 page 20: https://tools.ietf.org/html/rfc4291#page-20
-if [[ -z "${REMOTE_IPV6}" ]]; then
-  # Split the octets of the MAC into an array of strings.
-  IFS=":" read -a REMOTE_OCTETS <<< "${REMOTE_MAC}"
-  # Flip the global bit.
-  REMOTE_OCTETS[0]=$(printf '%x' "$((0x${REMOTE_OCTETS[0]} ^ 2))")
-  # Add the IPv6 address.
-  docker exec "${DUT}" \
-    ip addr add $(printf 'fe80::%02x%02x:%02xff:fe%02x:%02x%02x/64' \
-    "0x${REMOTE_OCTETS[0]}" "0x${REMOTE_OCTETS[1]}" "0x${REMOTE_OCTETS[2]}" \
-    "0x${REMOTE_OCTETS[3]}" "0x${REMOTE_OCTETS[4]}" "0x${REMOTE_OCTETS[5]}") \
-    scope link \
-    dev "${TEST_DEVICE}"
-  # Re-extract the IPv6 address.
-  # TODO(eyalsoha): Add "scope link" below when netstack supports correctly
-  # creating link-local IPv6 addresses.
-  REMOTE_IPV6=$(docker exec -t "${DUT}" ip addr show \
-    "${TEST_DEVICE}" | grep inet6 | cut -d' ' -f6 | cut -d'/' -f1)
-fi
-
-declare -r DOCKER_TESTBENCH_BINARY="/$(basename ${TESTBENCH_BINARY})"
-docker cp -L "${TESTBENCH_BINARY}" "${TESTBENCH}:${DOCKER_TESTBENCH_BINARY}"
-
-if [[ -z "${TSHARK-}" ]]; then
-  # Run tcpdump in the test bench unbuffered, without dns resolution, just on
-  # the interface with the test packets.
-  docker exec -t "${TESTBENCH}" \
-    tcpdump -S -vvv -U -n -i "${TEST_DEVICE}" \
-    net "${TEST_NET_PREFIX}/24" or \
-    host "${REMOTE_IPV6}" or \
-    host "${LOCAL_IPV6}" &
-else
-  # Run tshark in the test bench unbuffered, without dns resolution, just on the
-  # interface with the test packets.
-  docker exec -t "${TESTBENCH}" \
-    tshark -V -l -n -i "${TEST_DEVICE}" \
-    -o tcp.check_checksum:TRUE \
-    -o udp.check_checksum:TRUE \
-    net "${TEST_NET_PREFIX}/24" or \
-    host "${REMOTE_IPV6}" or \
-    host "${LOCAL_IPV6}" &
-fi
-
-# tcpdump and tshark take time to startup
-sleep 3
-
-# Start a packetimpact test on the test bench.  The packetimpact test sends and
-# receives packets and also sends POSIX socket commands to the posix_server to
-# be executed on the DUT.
-docker exec \
-  -e XML_OUTPUT_FILE="/test.xml" \
-  -e TEST_TARGET \
-  -t "${TESTBENCH}" \
-  /bin/bash -c "${DOCKER_TESTBENCH_BINARY} \
-  ${EXTRA_TEST_ARGS[@]-} \
-  --posix_server_ip=${CTRL_NET_PREFIX}${DUT_NET_SUFFIX} \
-  --posix_server_port=${CTRL_PORT} \
-  --remote_ipv4=${TEST_NET_PREFIX}${DUT_NET_SUFFIX} \
-  --local_ipv4=${TEST_NET_PREFIX}${TESTBENCH_NET_SUFFIX} \
-  --remote_ipv6=${REMOTE_IPV6} \
-  --local_ipv6=${LOCAL_IPV6} \
-  --remote_mac=${REMOTE_MAC} \
-  --local_mac=${LOCAL_MAC} \
-  --device=${TEST_DEVICE}" && true
-declare -r TEST_RESULT="${?}"
-if [[ -z "${EXPECT_FAILURE-}" && "${TEST_RESULT}" != 0 ]]; then
-  echo 'FAIL: This test was expected to pass.'
-  exit ${TEST_RESULT}
-fi
-if [[ ! -z "${EXPECT_FAILURE-}" && "${TEST_RESULT}" == 0 ]]; then
-  echo 'FAIL: This test was expected to fail but passed.  Enable the test and' \
-    'mark the corresponding bug as fixed.'
-  exit 1
-fi
-echo PASS: No errors.
diff --git a/test/packetimpact/tests/udp_icmp_error_propagation_test.go b/test/packetimpact/tests/udp_icmp_error_propagation_test.go
index ca4df2ab0..b754918f6 100644
--- a/test/packetimpact/tests/udp_icmp_error_propagation_test.go
+++ b/test/packetimpact/tests/udp_icmp_error_propagation_test.go
@@ -26,11 +26,11 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 type connectionMode bool
@@ -59,12 +59,12 @@ func (e icmpError) String() string {
 	return "Unknown ICMP error"
 }
 
-func (e icmpError) ToICMPv4() *tb.ICMPv4 {
+func (e icmpError) ToICMPv4() *testbench.ICMPv4 {
 	switch e {
 	case portUnreachable:
-		return &tb.ICMPv4{Type: tb.ICMPv4Type(header.ICMPv4DstUnreachable), Code: tb.Uint8(header.ICMPv4PortUnreachable)}
+		return &testbench.ICMPv4{Type: testbench.ICMPv4Type(header.ICMPv4DstUnreachable), Code: testbench.Uint8(header.ICMPv4PortUnreachable)}
 	case timeToLiveExceeded:
-		return &tb.ICMPv4{Type: tb.ICMPv4Type(header.ICMPv4TimeExceeded), Code: tb.Uint8(header.ICMPv4TTLExceeded)}
+		return &testbench.ICMPv4{Type: testbench.ICMPv4Type(header.ICMPv4TimeExceeded), Code: testbench.Uint8(header.ICMPv4TTLExceeded)}
 	}
 	return nil
 }
@@ -76,8 +76,8 @@ type errorDetection struct {
 }
 
 type testData struct {
-	dut        *tb.DUT
-	conn       *tb.UDPIPv4
+	dut        *testbench.DUT
+	conn       *testbench.UDPIPv4
 	remoteFD   int32
 	remotePort uint16
 	cleanFD    int32
@@ -95,25 +95,26 @@ func wantErrno(c connectionMode, icmpErr icmpError) syscall.Errno {
 }
 
 // sendICMPError sends an ICMP error message in response to a UDP datagram.
-func sendICMPError(conn *tb.UDPIPv4, icmpErr icmpError, udp *tb.UDP) error {
+func sendICMPError(conn *testbench.UDPIPv4, icmpErr icmpError, udp *testbench.UDP) error {
+	layers := (*testbench.Connection)(conn).CreateFrame(nil)
+	layers = layers[:len(layers)-1]
+	ip, ok := udp.Prev().(*testbench.IPv4)
+	if !ok {
+		return fmt.Errorf("expected %s to be IPv4", udp.Prev())
+	}
 	if icmpErr == timeToLiveExceeded {
-		ip, ok := udp.Prev().(*tb.IPv4)
-		if !ok {
-			return fmt.Errorf("expected %s to be IPv4", udp.Prev())
-		}
 		*ip.TTL = 1
 		// Let serialization recalculate the checksum since we set the TTL
 		// to 1.
 		ip.Checksum = nil
-
-		// Note that the ICMP payload is valid in this case because the UDP
-		// payload is empty. If the UDP payload were not empty, the packet
-		// length during serialization may not be calculated correctly,
-		// resulting in a mal-formed packet.
-		conn.SendIP(icmpErr.ToICMPv4(), ip, udp)
-	} else {
-		conn.SendIP(icmpErr.ToICMPv4(), udp.Prev(), udp)
 	}
+	// Note that the ICMP payload is valid in this case because the UDP
+	// payload is empty. If the UDP payload were not empty, the packet
+	// length during serialization may not be calculated correctly,
+	// resulting in a mal-formed packet.
+	layers = append(layers, icmpErr.ToICMPv4(), ip, udp)
+
+	(*testbench.Connection)(conn).SendFrameStateless(layers)
 	return nil
 }
 
@@ -123,10 +124,10 @@ func sendICMPError(conn *tb.UDPIPv4, icmpErr icmpError, udp *tb.UDP) error {
 // first recv should succeed immediately.
 func testRecv(ctx context.Context, d testData) error {
 	// Check that receiving on the clean socket works.
-	d.conn.Send(tb.UDP{DstPort: &d.cleanPort})
+	d.conn.Send(testbench.UDP{DstPort: &d.cleanPort})
 	d.dut.Recv(d.cleanFD, 100, 0)
 
-	d.conn.Send(tb.UDP{})
+	d.conn.Send(testbench.UDP{})
 
 	if d.wantErrno != syscall.Errno(0) {
 		ctx, cancel := context.WithTimeout(ctx, time.Second)
@@ -151,7 +152,7 @@ func testRecv(ctx context.Context, d testData) error {
 func testSendTo(ctx context.Context, d testData) error {
 	// Check that sending on the clean socket works.
 	d.dut.SendTo(d.cleanFD, nil, 0, d.conn.LocalAddr())
-	if _, err := d.conn.Expect(tb.UDP{SrcPort: &d.cleanPort}, time.Second); err != nil {
+	if _, err := d.conn.Expect(testbench.UDP{SrcPort: &d.cleanPort}, time.Second); err != nil {
 		return fmt.Errorf("did not receive UDP packet from clean socket on DUT: %s", err)
 	}
 
@@ -169,7 +170,7 @@ func testSendTo(ctx context.Context, d testData) error {
 	}
 
 	d.dut.SendTo(d.remoteFD, nil, 0, d.conn.LocalAddr())
-	if _, err := d.conn.Expect(tb.UDP{}, time.Second); err != nil {
+	if _, err := d.conn.Expect(testbench.UDP{}, time.Second); err != nil {
 		return fmt.Errorf("did not receive UDP packet as expected: %s", err)
 	}
 	return nil
@@ -187,7 +188,7 @@ func testSockOpt(_ context.Context, d testData) error {
 
 	// Check that after clearing socket error, sending doesn't fail.
 	d.dut.SendTo(d.remoteFD, nil, 0, d.conn.LocalAddr())
-	if _, err := d.conn.Expect(tb.UDP{}, time.Second); err != nil {
+	if _, err := d.conn.Expect(testbench.UDP{}, time.Second); err != nil {
 		return fmt.Errorf("did not receive UDP packet as expected: %s", err)
 	}
 	return nil
@@ -223,7 +224,7 @@ func TestUDPICMPErrorPropagation(t *testing.T) {
 				errorDetection{"SockOpt", false, testSockOpt},
 			} {
 				t.Run(fmt.Sprintf("%s/%s/%s", connect, icmpErr, errDetect.name), func(t *testing.T) {
-					dut := tb.NewDUT(t)
+					dut := testbench.NewDUT(t)
 					defer dut.TearDown()
 
 					remoteFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
@@ -234,7 +235,7 @@ func TestUDPICMPErrorPropagation(t *testing.T) {
 					cleanFD, cleanPort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
 					defer dut.Close(cleanFD)
 
-					conn := tb.NewUDPIPv4(t, tb.UDP{DstPort: &remotePort}, tb.UDP{SrcPort: &remotePort})
+					conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
 					defer conn.Close()
 
 					if connect {
@@ -243,7 +244,7 @@ func TestUDPICMPErrorPropagation(t *testing.T) {
 					}
 
 					dut.SendTo(remoteFD, nil, 0, conn.LocalAddr())
-					udp, err := conn.Expect(tb.UDP{}, time.Second)
+					udp, err := conn.Expect(testbench.UDP{}, time.Second)
 					if err != nil {
 						t.Fatalf("did not receive message from DUT: %s", err)
 					}
@@ -258,7 +259,7 @@ func TestUDPICMPErrorPropagation(t *testing.T) {
 						// involved in the generation of the ICMP error. As such,
 						// interactions between it and the the DUT should be independent of
 						// the ICMP error at least at the port level.
-						connClean := tb.NewUDPIPv4(t, tb.UDP{DstPort: &remotePort}, tb.UDP{SrcPort: &remotePort})
+						connClean := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
 						defer connClean.Close()
 
 						errDetectConn = &connClean
@@ -281,7 +282,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) {
 			wantErrno := wantErrno(connect, icmpErr)
 
 			t.Run(fmt.Sprintf("%s/%s", connect, icmpErr), func(t *testing.T) {
-				dut := tb.NewDUT(t)
+				dut := testbench.NewDUT(t)
 				defer dut.TearDown()
 
 				remoteFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
@@ -292,7 +293,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) {
 				cleanFD, cleanPort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
 				defer dut.Close(cleanFD)
 
-				conn := tb.NewUDPIPv4(t, tb.UDP{DstPort: &remotePort}, tb.UDP{SrcPort: &remotePort})
+				conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
 				defer conn.Close()
 
 				if connect {
@@ -301,7 +302,7 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) {
 				}
 
 				dut.SendTo(remoteFD, nil, 0, conn.LocalAddr())
-				udp, err := conn.Expect(tb.UDP{}, time.Second)
+				udp, err := conn.Expect(testbench.UDP{}, time.Second)
 				if err != nil {
 					t.Fatalf("did not receive message from DUT: %s", err)
 				}
@@ -355,8 +356,8 @@ func TestICMPErrorDuringUDPRecv(t *testing.T) {
 					t.Fatal(err)
 				}
 
-				conn.Send(tb.UDP{DstPort: &cleanPort})
-				conn.Send(tb.UDP{})
+				conn.Send(testbench.UDP{DstPort: &cleanPort})
+				conn.Send(testbench.UDP{})
 				wg.Wait()
 			})
 		}
diff --git a/test/packetimpact/tests/udp_recv_multicast_test.go b/test/packetimpact/tests/udp_recv_multicast_test.go
index 0bae18ba3..77a9bfa1d 100644
--- a/test/packetimpact/tests/udp_recv_multicast_test.go
+++ b/test/packetimpact/tests/udp_recv_multicast_test.go
@@ -21,22 +21,20 @@ import (
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/tcpip"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 func TestUDPRecvMulticast(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
 	boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
 	defer dut.Close(boundFD)
-	conn := tb.NewUDPIPv4(t, tb.UDP{DstPort: &remotePort}, tb.UDP{SrcPort: &remotePort})
+	conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
 	defer conn.Close()
-	frame := conn.CreateFrame(&tb.UDP{}, &tb.Payload{Bytes: []byte("hello world")})
-	frame[1].(*tb.IPv4).DstAddr = tb.Address(tcpip.Address(net.ParseIP("224.0.0.1").To4()))
-	conn.SendFrame(frame)
+	conn.SendIP(testbench.IPv4{DstAddr: testbench.Address(tcpip.Address(net.ParseIP("224.0.0.1").To4()))}, testbench.UDP{})
 	dut.Recv(boundFD, 100, 0)
 }
diff --git a/test/packetimpact/tests/udp_send_recv_dgram_test.go b/test/packetimpact/tests/udp_send_recv_dgram_test.go
index 350875a6f..a7db384ad 100644
--- a/test/packetimpact/tests/udp_send_recv_dgram_test.go
+++ b/test/packetimpact/tests/udp_send_recv_dgram_test.go
@@ -22,11 +22,11 @@ import (
 	"time"
 
 	"golang.org/x/sys/unix"
-	tb "gvisor.dev/gvisor/test/packetimpact/testbench"
+	"gvisor.dev/gvisor/test/packetimpact/testbench"
 )
 
 func init() {
-	tb.RegisterFlags(flag.CommandLine)
+	testbench.RegisterFlags(flag.CommandLine)
 }
 
 func generateRandomPayload(t *testing.T, n int) string {
@@ -39,11 +39,11 @@ func generateRandomPayload(t *testing.T, n int) string {
 }
 
 func TestUDPRecv(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
 	boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
 	defer dut.Close(boundFD)
-	conn := tb.NewUDPIPv4(t, tb.UDP{DstPort: &remotePort}, tb.UDP{SrcPort: &remotePort})
+	conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
 	defer conn.Close()
 
 	testCases := []struct {
@@ -59,8 +59,7 @@ func TestUDPRecv(t *testing.T) {
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
-			frame := conn.CreateFrame(&tb.UDP{}, &tb.Payload{Bytes: []byte(tc.payload)})
-			conn.SendFrame(frame)
+			conn.Send(testbench.UDP{}, &testbench.Payload{Bytes: []byte(tc.payload)})
 			if got, want := string(dut.Recv(boundFD, int32(len(tc.payload)), 0)), tc.payload; got != want {
 				t.Fatalf("received payload does not match sent payload got: %s, want: %s", got, want)
 			}
@@ -69,11 +68,11 @@ func TestUDPRecv(t *testing.T) {
 }
 
 func TestUDPSend(t *testing.T) {
-	dut := tb.NewDUT(t)
+	dut := testbench.NewDUT(t)
 	defer dut.TearDown()
 	boundFD, remotePort := dut.CreateBoundSocket(unix.SOCK_DGRAM, unix.IPPROTO_UDP, net.ParseIP("0.0.0.0"))
 	defer dut.Close(boundFD)
-	conn := tb.NewUDPIPv4(t, tb.UDP{DstPort: &remotePort}, tb.UDP{SrcPort: &remotePort})
+	conn := testbench.NewUDPIPv4(t, testbench.UDP{DstPort: &remotePort}, testbench.UDP{SrcPort: &remotePort})
 	defer conn.Close()
 
 	testCases := []struct {
@@ -93,7 +92,7 @@ func TestUDPSend(t *testing.T) {
 			if got, want := int(dut.SendTo(boundFD, []byte(tc.payload), 0, conn.LocalAddr())), len(tc.payload); got != want {
 				t.Fatalf("short write got: %d, want: %d", got, want)
 			}
-			if _, err := conn.ExpectData(tb.UDP{SrcPort: &remotePort}, tb.Payload{Bytes: []byte(tc.payload)}, 1*time.Second); err != nil {
+			if _, err := conn.ExpectData(testbench.UDP{SrcPort: &remotePort}, testbench.Payload{Bytes: []byte(tc.payload)}, 1*time.Second); err != nil {
 				t.Fatal(err)
 			}
 		})
diff --git a/test/root/BUILD b/test/root/BUILD
index 639e293e3..a9e91ccd6 100644
--- a/test/root/BUILD
+++ b/test/root/BUILD
@@ -33,6 +33,7 @@ go_test(
     ],
     visibility = ["//:sandbox"],
     deps = [
+        "//pkg/cleanup",
         "//pkg/test/criutil",
         "//pkg/test/dockerutil",
         "//pkg/test/testutil",
diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go
index 85007dcce..c138e02dc 100644
--- a/test/root/crictl_test.go
+++ b/test/root/crictl_test.go
@@ -30,10 +30,10 @@ import (
 	"testing"
 	"time"
 
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/test/criutil"
 	"gvisor.dev/gvisor/pkg/test/dockerutil"
 	"gvisor.dev/gvisor/pkg/test/testutil"
-	"gvisor.dev/gvisor/runsc/specutils"
 )
 
 // Tests for crictl have to be run as root (rather than in a user namespace)
@@ -272,27 +272,20 @@ disabled_plugins = ["restart"]
 // * Runs containerd and waits for it to reach a "ready" state for testing.
 // * Returns a cleanup function that should be called at the end of the test.
 func setup(t *testing.T) (*criutil.Crictl, func(), error) {
-	var cleanups []func()
-	cleanupFunc := func() {
-		for i := len(cleanups) - 1; i >= 0; i-- {
-			cleanups[i]()
-		}
-	}
-	cleanup := specutils.MakeCleanup(cleanupFunc)
-	defer cleanup.Clean()
-
 	// Create temporary containerd root and state directories, and a socket
 	// via which crictl and containerd communicate.
 	containerdRoot, err := ioutil.TempDir(testutil.TmpDir(), "containerd-root")
 	if err != nil {
 		t.Fatalf("failed to create containerd root: %v", err)
 	}
-	cleanups = append(cleanups, func() { os.RemoveAll(containerdRoot) })
+	cu := cleanup.Make(func() { os.RemoveAll(containerdRoot) })
+	defer cu.Clean()
+
 	containerdState, err := ioutil.TempDir(testutil.TmpDir(), "containerd-state")
 	if err != nil {
 		t.Fatalf("failed to create containerd state: %v", err)
 	}
-	cleanups = append(cleanups, func() { os.RemoveAll(containerdState) })
+	cu.Add(func() { os.RemoveAll(containerdState) })
 	sockAddr := filepath.Join(testutil.TmpDir(), "containerd-test.sock")
 
 	// We rewrite a configuration. This is based on the current docker
@@ -305,7 +298,7 @@ func setup(t *testing.T) (*criutil.Crictl, func(), error) {
 	if err != nil {
 		t.Fatalf("failed to write containerd config")
 	}
-	cleanups = append(cleanups, configCleanup)
+	cu.Add(configCleanup)
 
 	// Start containerd.
 	cmd := exec.Command(getContainerd(),
@@ -321,7 +314,8 @@ func setup(t *testing.T) (*criutil.Crictl, func(), error) {
 	stdout := &bytes.Buffer{}
 	cmd.Stderr = io.MultiWriter(startupW, stderr)
 	cmd.Stdout = io.MultiWriter(startupW, stdout)
-	cleanups = append(cleanups, func() {
+	cu.Add(func() {
+		// Log output in case of failure.
 		t.Logf("containerd stdout: %s", stdout.String())
 		t.Logf("containerd stderr: %s", stderr.String())
 	})
@@ -338,15 +332,14 @@ func setup(t *testing.T) (*criutil.Crictl, func(), error) {
 
 	// Kill must be the last cleanup (as it will be executed first).
 	cc := criutil.NewCrictl(t, sockAddr)
-	cleanups = append(cleanups, func() {
+	cu.Add(func() {
 		cc.CleanUp() // Remove tmp files, etc.
 		if err := testutil.KillCommand(cmd); err != nil {
 			log.Printf("error killing containerd: %v", err)
 		}
 	})
 
-	cleanup.Release()
-	return cc, cleanupFunc, nil
+	return cc, cu.Release(), nil
 }
 
 // httpGet GETs the contents of a file served from a pod on port 80.
diff --git a/test/root/oom_score_adj_test.go b/test/root/oom_score_adj_test.go
index 9a3cecd97..4243eb59e 100644
--- a/test/root/oom_score_adj_test.go
+++ b/test/root/oom_score_adj_test.go
@@ -20,6 +20,7 @@ import (
 	"testing"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -324,40 +325,26 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 }
 
 func startContainers(t *testing.T, specs []*specs.Spec, ids []string) ([]*container.Container, func(), error) {
-	var (
-		containers []*container.Container
-		cleanups   []func()
-	)
-	cleanups = append(cleanups, func() {
-		for _, c := range containers {
-			c.Destroy()
-		}
-	})
-	cleanupAll := func() {
-		for _, c := range cleanups {
-			c()
-		}
-	}
-	localClean := specutils.MakeCleanup(cleanupAll)
-	defer localClean.Clean()
+	var containers []*container.Container
 
 	// All containers must share the same root.
-	rootDir, cleanup, err := testutil.SetupRootDir()
+	rootDir, clean, err := testutil.SetupRootDir()
 	if err != nil {
 		t.Fatalf("error creating root dir: %v", err)
 	}
-	cleanups = append(cleanups, cleanup)
+	cu := cleanup.Make(clean)
+	defer cu.Clean()
 
 	// Point this to from the configuration.
 	conf := testutil.TestConfig(t)
 	conf.RootDir = rootDir
 
 	for i, spec := range specs {
-		bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
+		bundleDir, clean, err := testutil.SetupBundleDir(spec)
 		if err != nil {
 			return nil, nil, fmt.Errorf("error setting up bundle: %v", err)
 		}
-		cleanups = append(cleanups, cleanup)
+		cu.Add(clean)
 
 		args := container.Args{
 			ID:        ids[i],
@@ -375,6 +362,5 @@ func startContainers(t *testing.T, specs []*specs.Spec, ids []string) ([]*contai
 		}
 	}
 
-	localClean.Release()
-	return containers, cleanupAll, nil
+	return containers, cu.Release(), nil
 }
diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl
index 0a75b158f..5a83f8060 100644
--- a/test/runner/defs.bzl
+++ b/test/runner/defs.bzl
@@ -60,7 +60,8 @@ def _syscall_test(
         network = "none",
         file_access = "exclusive",
         overlay = False,
-        add_uds_tree = False):
+        add_uds_tree = False,
+        vfs2 = False):
     # Prepend "runsc" to non-native platform names.
     full_platform = platform if platform == "native" else "runsc_" + platform
 
@@ -70,6 +71,8 @@ def _syscall_test(
         name += "_shared"
     if overlay:
         name += "_overlay"
+    if vfs2:
+        name += "_vfs2"
     if network != "none":
         name += "_" + network + "net"
 
@@ -90,6 +93,7 @@ def _syscall_test(
     # we figure out how to request ipv4 sockets on Guitar machines.
     if network == "host":
         tags.append("noguitar")
+        tags.append("block-network")
 
     # Disable off-host networking.
     tags.append("requires-net:loopback")
@@ -102,6 +106,7 @@ def _syscall_test(
         "--file-access=" + file_access,
         "--overlay=" + str(overlay),
         "--add-uds-tree=" + str(add_uds_tree),
+        "--vfs2=" + str(vfs2),
     ]
 
     # Call the rule above.
@@ -123,6 +128,7 @@ def syscall_test(
         add_overlay = False,
         add_uds_tree = False,
         add_hostinet = False,
+        vfs2 = False,
         tags = None):
     """syscall_test is a macro that will create targets for all platforms.
 
@@ -160,6 +166,29 @@ def syscall_test(
             tags = platform_tags + tags,
         )
 
+    vfs2_tags = list(tags)
+    if vfs2:
+        # Add tag to easily run VFS2 tests with --test_tag_filters=vfs2
+        vfs2_tags.append("vfs2")
+
+    else:
+        # Don't automatically run tests tests not yet passing.
+        vfs2_tags.append("manual")
+        vfs2_tags.append("noguitar")
+        vfs2_tags.append("notap")
+
+    _syscall_test(
+        test = test,
+        shard_count = shard_count,
+        size = size,
+        platform = default_platform,
+        use_tmpfs = use_tmpfs,
+        add_uds_tree = add_uds_tree,
+        tags = platforms[default_platform] + vfs2_tags,
+        vfs2 = True,
+    )
+
+    # TODO(gvisor.dev/issue/1487): Enable VFS2 overlay tests.
     if add_overlay:
         _syscall_test(
             test = test,
@@ -172,6 +201,18 @@ def syscall_test(
             overlay = True,
         )
 
+    if add_hostinet:
+        _syscall_test(
+            test = test,
+            shard_count = shard_count,
+            size = size,
+            platform = default_platform,
+            use_tmpfs = use_tmpfs,
+            network = "host",
+            add_uds_tree = add_uds_tree,
+            tags = platforms[default_platform] + tags,
+        )
+
     if not use_tmpfs:
         # Also test shared gofer access.
         _syscall_test(
@@ -184,15 +225,14 @@ def syscall_test(
             tags = platforms[default_platform] + tags,
             file_access = "shared",
         )
-
-    if add_hostinet:
         _syscall_test(
             test = test,
             shard_count = shard_count,
             size = size,
             platform = default_platform,
             use_tmpfs = use_tmpfs,
-            network = "host",
             add_uds_tree = add_uds_tree,
-            tags = platforms[default_platform] + tags,
+            tags = platforms[default_platform] + vfs2_tags,
+            file_access = "shared",
+            vfs2 = True,
         )
diff --git a/test/runner/runner.go b/test/runner/runner.go
index 14c9cbc47..948e3a8ef 100644
--- a/test/runner/runner.go
+++ b/test/runner/runner.go
@@ -46,6 +46,7 @@ var (
 	useTmpfs   = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
 	fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
 	overlay    = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
+	vfs2       = flag.Bool("vfs2", false, "enable VFS2")
 	parallel   = flag.Bool("parallel", false, "run tests in parallel")
 	runscPath  = flag.String("runsc", "", "path to runsc binary")
 
@@ -146,6 +147,9 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 	if *overlay {
 		args = append(args, "-overlay")
 	}
+	if *vfs2 {
+		args = append(args, "-vfs2")
+	}
 	if *debug {
 		args = append(args, "-debug", "-log-packets=true")
 	}
@@ -204,7 +208,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
 			return
 		}
 		log.Warningf("%s: Got signal: %v", name, s)
-		done := make(chan bool)
+		done := make(chan bool, 1)
 		dArgs := append([]string{}, args...)
 		dArgs = append(dArgs, "-alsologtostderr=true", "debug", "--stacks", id)
 		go func(dArgs []string) {
@@ -341,11 +345,13 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
 		}
 	}
 
-	// Set environment variables that indicate we are
-	// running in gVisor with the given platform and network.
+	// Set environment variables that indicate we are running in gVisor with
+	// the given platform, network, and filesystem stack.
+	// TODO(gvisor.dev/issue/1487): Update this when the runner supports VFS2.
 	platformVar := "TEST_ON_GVISOR"
 	networkVar := "GVISOR_NETWORK"
-	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
+	vfsVar := "GVISOR_VFS"
+	env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network, vfsVar+"=VFS1")
 
 	// Remove env variables that cause the gunit binary to write output
 	// files, since they will stomp on eachother, and on the output files
diff --git a/test/runtimes/proctor/BUILD b/test/runtimes/proctor/BUILD
index da1e331e1..f76e2ddc0 100644
--- a/test/runtimes/proctor/BUILD
+++ b/test/runtimes/proctor/BUILD
@@ -21,7 +21,7 @@ go_test(
     size = "small",
     srcs = ["proctor_test.go"],
     library = ":proctor",
-    nocgo = 1,
+    pure = True,
     deps = [
         "//pkg/test/testutil",
     ],
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 9800a0cdf..d68afbe44 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -2,22 +2,33 @@ load("//test/runner:defs.bzl", "syscall_test")
 
 package(licenses = ["notice"])
 
-syscall_test(test = "//test/syscalls/linux:32bit_test")
+syscall_test(
+    test = "//test/syscalls/linux:32bit_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:accept_bind_stream_test")
+syscall_test(
+    test = "//test/syscalls/linux:accept_bind_stream_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:accept_bind_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:access_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:affinity_test")
+syscall_test(
+    test = "//test/syscalls/linux:affinity_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
@@ -28,11 +39,18 @@ syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:alarm_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:arch_prctl_test")
+syscall_test(
+    test = "//test/syscalls/linux:arch_prctl_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:bad_test")
+syscall_test(
+    test = "//test/syscalls/linux:bad_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
@@ -40,9 +58,15 @@ syscall_test(
     test = "//test/syscalls/linux:bind_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:brk_test")
+syscall_test(
+    test = "//test/syscalls/linux:brk_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
@@ -51,16 +75,19 @@ syscall_test(
     # involve much concurrency, TSAN's usefulness here is limited anyway.
     tags = ["nogotsan"],
     test = "//test/syscalls/linux:socket_stress_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chdir_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chmod_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -68,6 +95,7 @@ syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:chown_test",
     use_tmpfs = True,  # chwon tests require gofer to be running as root.
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -75,45 +103,70 @@ syscall_test(
     test = "//test/syscalls/linux:chroot_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:clock_getres_test")
+syscall_test(
+    test = "//test/syscalls/linux:clock_getres_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:clock_gettime_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:clock_nanosleep_test")
+syscall_test(
+    test = "//test/syscalls/linux:clock_nanosleep_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:concurrency_test")
+syscall_test(
+    test = "//test/syscalls/linux:concurrency_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_uds_tree = True,
     test = "//test/syscalls/linux:connect_external_test",
     use_tmpfs = True,
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:creat_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:dev_test")
+syscall_test(
+    test = "//test/syscalls/linux:dev_test",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:dup_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:epoll_test")
+syscall_test(
+    test = "//test/syscalls/linux:epoll_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:eventfd_test")
+syscall_test(
+    test = "//test/syscalls/linux:eventfd_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:exceptions_test")
+syscall_test(
+    test = "//test/syscalls/linux:exceptions_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     add_overlay = True,
     test = "//test/syscalls/linux:exec_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -122,7 +175,10 @@ syscall_test(
     test = "//test/syscalls/linux:exec_binary_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:exit_test")
+syscall_test(
+    test = "//test/syscalls/linux:exit_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
@@ -134,11 +190,15 @@ syscall_test(
     test = "//test/syscalls/linux:fallocate_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:fault_test")
+syscall_test(
+    test = "//test/syscalls/linux:fault_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:fchdir_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -152,11 +212,20 @@ syscall_test(
     test = "//test/syscalls/linux:flock_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:fork_test")
+syscall_test(
+    test = "//test/syscalls/linux:fork_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:fpsig_fork_test")
+syscall_test(
+    test = "//test/syscalls/linux:fpsig_fork_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:fpsig_nested_test")
+syscall_test(
+    test = "//test/syscalls/linux:fpsig_nested_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
@@ -167,20 +236,33 @@ syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:futex_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:getcpu_host_test")
+syscall_test(
+    test = "//test/syscalls/linux:getcpu_host_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:getcpu_test")
+syscall_test(
+    test = "//test/syscalls/linux:getcpu_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:getdents_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:getrandom_test")
+syscall_test(
+    test = "//test/syscalls/linux:getrandom_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:getrusage_test")
+syscall_test(
+    test = "//test/syscalls/linux:getrusage_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
@@ -196,15 +278,20 @@ syscall_test(
 
 syscall_test(
     test = "//test/syscalls/linux:iptables_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 5,
     test = "//test/syscalls/linux:itimer_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:kill_test")
+syscall_test(
+    test = "//test/syscalls/linux:kill_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
@@ -215,19 +302,33 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:lseek_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:madvise_test")
+syscall_test(
+    test = "//test/syscalls/linux:madvise_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:memory_accounting_test")
+syscall_test(
+    test = "//test/syscalls/linux:memory_accounting_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:mempolicy_test")
+syscall_test(
+    test = "//test/syscalls/linux:mempolicy_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:mincore_test")
+syscall_test(
+    test = "//test/syscalls/linux:mincore_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:mkdir_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -249,20 +350,29 @@ syscall_test(
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:mremap_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:msync_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:munmap_test")
+syscall_test(
+    test = "//test/syscalls/linux:munmap_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:network_namespace_test")
+syscall_test(
+    test = "//test/syscalls/linux:network_namespace_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:open_create_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -270,40 +380,73 @@ syscall_test(
     test = "//test/syscalls/linux:open_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:packet_socket_raw_test")
+syscall_test(
+    test = "//test/syscalls/linux:packet_socket_raw_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:packet_socket_test")
+syscall_test(
+    test = "//test/syscalls/linux:packet_socket_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:partial_bad_buffer_test")
+syscall_test(
+    test = "//test/syscalls/linux:partial_bad_buffer_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:pause_test")
+syscall_test(
+    test = "//test/syscalls/linux:pause_test",
+    vfs2 = "True",
+)
+
+syscall_test(
+    size = "medium",
+    # Takes too long under gotsan to run.
+    tags = ["nogotsan"],
+    test = "//test/syscalls/linux:ping_socket_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
     add_overlay = True,
     shard_count = 5,
     test = "//test/syscalls/linux:pipe_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:poll_test")
+syscall_test(
+    test = "//test/syscalls/linux:poll_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:ppoll_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:prctl_setuid_test")
+syscall_test(
+    test = "//test/syscalls/linux:prctl_setuid_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:prctl_test")
+syscall_test(
+    test = "//test/syscalls/linux:prctl_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:pread64_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:preadv_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -311,36 +454,56 @@ syscall_test(
     test = "//test/syscalls/linux:preadv2_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:priority_test")
+syscall_test(
+    test = "//test/syscalls/linux:priority_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:proc_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:proc_net_test")
+syscall_test(
+    test = "//test/syscalls/linux:proc_net_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:proc_pid_oomscore_test")
+syscall_test(
+    test = "//test/syscalls/linux:proc_pid_oomscore_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:proc_pid_smaps_test")
+syscall_test(
+    test = "//test/syscalls/linux:proc_pid_smaps_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test")
+syscall_test(
+    test = "//test/syscalls/linux:proc_pid_uid_gid_map_test",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:pselect_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:ptrace_test")
+syscall_test(
+    test = "//test/syscalls/linux:ptrace_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:pty_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     test = "//test/syscalls/linux:pty_root_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -351,17 +514,28 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:pwrite64_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:raw_socket_hdrincl_test")
+syscall_test(
+    test = "//test/syscalls/linux:raw_socket_hdrincl_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:raw_socket_icmp_test")
+syscall_test(
+    test = "//test/syscalls/linux:raw_socket_icmp_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:raw_socket_ipv4_test")
+syscall_test(
+    test = "//test/syscalls/linux:raw_socket_ipv4_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:read_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -373,12 +547,14 @@ syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:readv_socket_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     add_overlay = True,
     test = "//test/syscalls/linux:readv_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -387,25 +563,50 @@ syscall_test(
     test = "//test/syscalls/linux:rename_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:rlimits_test")
+syscall_test(
+    test = "//test/syscalls/linux:rlimits_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:rseq_test")
+syscall_test(
+    test = "//test/syscalls/linux:rseq_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:rtsignal_test")
+syscall_test(
+    test = "//test/syscalls/linux:rtsignal_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:signalfd_test")
+syscall_test(
+    test = "//test/syscalls/linux:signalfd_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:sched_test")
+syscall_test(
+    test = "//test/syscalls/linux:sched_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:sched_yield_test")
+syscall_test(
+    test = "//test/syscalls/linux:sched_yield_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:seccomp_test")
+syscall_test(
+    test = "//test/syscalls/linux:seccomp_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:select_test")
+syscall_test(
+    test = "//test/syscalls/linux:select_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     shard_count = 20,
     test = "//test/syscalls/linux:semaphore_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -421,49 +622,68 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:splice_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:sigaction_test")
+syscall_test(
+    test = "//test/syscalls/linux:sigaction_test",
+    vfs2 = "True",
+)
 
 # TODO(b/119826902): Enable once the test passes in runsc.
-# syscall_test(test = "//test/syscalls/linux:sigaltstack_test")
+# syscall_test(vfs2="True",test = "//test/syscalls/linux:sigaltstack_test")
 
-syscall_test(test = "//test/syscalls/linux:sigiret_test")
+syscall_test(
+    test = "//test/syscalls/linux:sigiret_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:sigprocmask_test")
+syscall_test(
+    test = "//test/syscalls/linux:sigprocmask_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:sigstop_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:sigtimedwait_test")
+syscall_test(
+    test = "//test/syscalls/linux:sigtimedwait_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:shm_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_abstract_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_abstract_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_domain_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_domain_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -488,59 +708,99 @@ syscall_test(
 syscall_test(
     size = "large",
     shard_count = 50,
+    # Takes too long for TSAN. Creates a lot of TCP sockets.
+    tags = ["nogotsan"],
+    test = "//test/syscalls/linux:socket_inet_loopback_nogotsan_test",
+)
+
+syscall_test(
+    size = "large",
+    shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_generic_loopback_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_ip_tcp_loopback_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_loopback_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_tcp_udp_generic_loopback_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_ip_udp_loopback_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "large",
     shard_count = 50,
     test = "//test/syscalls/linux:socket_ip_udp_loopback_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_ipv4_udp_unbound_loopback_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:socket_ip_unbound_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_ip_unbound_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_netdevice_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_netdevice_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_netlink_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_netlink_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_netlink_route_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_netlink_route_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_netlink_uevent_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_netlink_uevent_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_blocking_local_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_blocking_local_test",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_blocking_ip_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_blocking_ip_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_non_stream_blocking_local_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_non_stream_blocking_local_test",
+)
 
-syscall_test(test = "//test/syscalls/linux:socket_non_stream_blocking_udp_test")
+syscall_test(
+    test = "//test/syscalls/linux:socket_non_stream_blocking_udp_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "large",
@@ -550,6 +810,7 @@ syscall_test(
 syscall_test(
     size = "large",
     test = "//test/syscalls/linux:socket_stream_blocking_tcp_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -572,6 +833,7 @@ syscall_test(
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_unix_dgram_non_blocking_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -579,6 +841,7 @@ syscall_test(
     add_overlay = True,
     shard_count = 50,
     test = "//test/syscalls/linux:socket_unix_pair_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -596,11 +859,13 @@ syscall_test(
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_unix_unbound_abstract_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:socket_unix_unbound_dgram_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -612,6 +877,7 @@ syscall_test(
     size = "medium",
     shard_count = 10,
     test = "//test/syscalls/linux:socket_unix_unbound_seqpacket_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -623,6 +889,7 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:statfs_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -633,6 +900,7 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:stat_times_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -648,6 +916,7 @@ syscall_test(
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:sync_test",
+    vfs2 = "True",
 )
 
 syscall_test(
@@ -655,86 +924,151 @@ syscall_test(
     test = "//test/syscalls/linux:sync_file_range_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:sysinfo_test")
+syscall_test(
+    test = "//test/syscalls/linux:sysinfo_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:syslog_test")
+syscall_test(
+    test = "//test/syscalls/linux:syslog_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:sysret_test")
+syscall_test(
+    test = "//test/syscalls/linux:sysret_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     shard_count = 10,
     test = "//test/syscalls/linux:tcp_socket_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:tgkill_test")
+syscall_test(
+    test = "//test/syscalls/linux:tgkill_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:timerfd_test")
+syscall_test(
+    test = "//test/syscalls/linux:timerfd_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:timers_test")
+syscall_test(
+    test = "//test/syscalls/linux:timers_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:time_test")
+syscall_test(
+    test = "//test/syscalls/linux:time_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:tkill_test")
+syscall_test(
+    test = "//test/syscalls/linux:tkill_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:truncate_test",
 )
 
-syscall_test(test = "//test/syscalls/linux:tuntap_test")
+syscall_test(
+    test = "//test/syscalls/linux:tuntap_test",
+)
 
 syscall_test(
     add_hostinet = True,
     test = "//test/syscalls/linux:tuntap_hostinet_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:udp_bind_test")
+syscall_test(
+    test = "//test/syscalls/linux:udp_bind_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     add_hostinet = True,
     shard_count = 10,
     test = "//test/syscalls/linux:udp_socket_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:uidgid_test")
+syscall_test(
+    test = "//test/syscalls/linux:uidgid_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:uname_test")
+syscall_test(
+    test = "//test/syscalls/linux:uname_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:unlink_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:unshare_test")
+syscall_test(
+    test = "//test/syscalls/linux:unshare_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:utimes_test")
+syscall_test(
+    test = "//test/syscalls/linux:utimes_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     test = "//test/syscalls/linux:vdso_clock_gettime_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:vdso_test")
+syscall_test(
+    test = "//test/syscalls/linux:vdso_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:vsyscall_test")
+syscall_test(
+    test = "//test/syscalls/linux:vsyscall_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:vfork_test")
+syscall_test(
+    test = "//test/syscalls/linux:vfork_test",
+    vfs2 = "True",
+)
 
 syscall_test(
     size = "medium",
     shard_count = 5,
     test = "//test/syscalls/linux:wait_test",
+    vfs2 = "True",
 )
 
 syscall_test(
     add_overlay = True,
     test = "//test/syscalls/linux:write_test",
+    vfs2 = "True",
 )
 
-syscall_test(test = "//test/syscalls/linux:proc_net_unix_test")
+syscall_test(
+    test = "//test/syscalls/linux:proc_net_unix_test",
+)
 
-syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
+syscall_test(
+    test = "//test/syscalls/linux:proc_net_tcp_test",
+    vfs2 = "True",
+)
 
-syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
+syscall_test(
+    test = "//test/syscalls/linux:proc_net_udp_test",
+    vfs2 = "True",
+)
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 3f12a04c0..6d051f48b 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -802,10 +802,13 @@ cc_binary(
     ],
     linkstatic = 1,
     deps = [
+        ":socket_test_util",
         "//test/util:file_descriptor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
         gtest,
+        "//test/util:epoll_util",
+        "//test/util:eventfd_util",
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
@@ -948,6 +951,7 @@ cc_binary(
         "//test/util:epoll_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
+        "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
@@ -1379,7 +1383,7 @@ cc_binary(
     srcs = ["partial_bad_buffer.cc"],
     linkstatic = 1,
     deps = [
-        "//test/syscalls/linux:socket_test_util",
+        ":socket_test_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
         "@com_google_absl//absl/time",
@@ -1408,6 +1412,21 @@ cc_binary(
 )
 
 cc_binary(
+    name = "ping_socket_test",
+    testonly = 1,
+    srcs = ["ping_socket.cc"],
+    linkstatic = 1,
+    deps = [
+        ":socket_test_util",
+        "//test/util:file_descriptor",
+        gtest,
+        "//test/util:save_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+    ],
+)
+
+cc_binary(
     name = "pipe_test",
     testonly = 1,
     srcs = ["pipe.cc"],
@@ -2777,6 +2796,26 @@ cc_binary(
 )
 
 cc_binary(
+    name = "socket_inet_loopback_nogotsan_test",
+    testonly = 1,
+    srcs = ["socket_inet_loopback_nogotsan.cc"],
+    linkstatic = 1,
+    deps = [
+        ":ip_socket_test_util",
+        ":socket_test_util",
+        "//test/util:file_descriptor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        gtest,
+        "//test/util:posix_error",
+        "//test/util:save_util",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
+cc_binary(
     name = "socket_netlink_test",
     testonly = 1,
     srcs = ["socket_netlink.cc"],
@@ -3458,7 +3497,7 @@ cc_binary(
     deps = [
         ":socket_test_util",
         gtest,
-        "//test/syscalls/linux:socket_netlink_route_util",
+        ":socket_netlink_route_util",
         "//test/util:capability_util",
         "//test/util:file_descriptor",
         "//test/util:fs_util",
diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index e08c578f0..f65a14fb8 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <stdio.h>
+#include <sys/socket.h>
 #include <sys/un.h>
 
 #include <algorithm>
@@ -141,6 +142,47 @@ TEST_P(AllSocketPairTest, Connect) {
               SyscallSucceeds());
 }
 
+TEST_P(AllSocketPairTest, ConnectWithWrongType) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int type;
+  socklen_t typelen = sizeof(type);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), SOL_SOCKET, SO_TYPE, &type, &typelen),
+      SyscallSucceeds());
+  switch (type) {
+    case SOCK_STREAM:
+      type = SOCK_SEQPACKET;
+      break;
+    case SOCK_SEQPACKET:
+      type = SOCK_STREAM;
+      break;
+  }
+
+  const FileDescriptor another_socket =
+      ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, type, 0));
+
+  ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+                   sockets->first_addr_size()),
+              SyscallSucceeds());
+
+  ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+  if (sockets->first_addr()->sa_data[0] != 0) {
+    ASSERT_THAT(connect(another_socket.get(), sockets->first_addr(),
+                        sockets->first_addr_size()),
+                SyscallFailsWithErrno(EPROTOTYPE));
+  } else {
+    ASSERT_THAT(connect(another_socket.get(), sockets->first_addr(),
+                        sockets->first_addr_size()),
+                SyscallFailsWithErrno(ECONNREFUSED));
+  }
+
+  ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+                      sockets->first_addr_size()),
+              SyscallSucceeds());
+}
+
 TEST_P(AllSocketPairTest, ConnectNonListening) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 12c9b05ca..e09afafe9 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -673,6 +673,33 @@ TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
   EXPECT_EQ(execve_errno, ELOOP);
 }
 
+TEST(ExecveatTest, UnshareFiles) {
+  TempPath tempFile = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "bar", 0755));
+  const FileDescriptor fd_closed_on_exec =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC));
+
+  pid_t child;
+  EXPECT_THAT(child = syscall(__NR_clone, SIGCHLD | CLONE_VFORK | CLONE_FILES,
+                              0, 0, 0, 0),
+              SyscallSucceeds());
+  if (child == 0) {
+    ExecveArray argv = {"test"};
+    ExecveArray envp;
+    ASSERT_THAT(
+        execve(RunfilePath(kBasicWorkload).c_str(), argv.get(), envp.get()),
+        SyscallSucceeds());
+    _exit(1);
+  }
+
+  int status;
+  ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+  EXPECT_EQ(status, 0);
+
+  struct stat st;
+  EXPECT_THAT(fstat(fd_closed_on_exec.get(), &st), SyscallSucceeds());
+}
+
 TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
   std::string parent_dir = "/tmp";
   TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
index 1a9f203b9..18d2f22c1 100644
--- a/test/syscalls/linux/exec_binary.cc
+++ b/test/syscalls/linux/exec_binary.cc
@@ -438,7 +438,12 @@ TEST(ElfTest, MissingText) {
   ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
               SyscallSucceedsWithValue(child));
   // It runs off the end of the zeroes filling the end of the page.
+#if defined(__x86_64__)
   EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) << status;
+#elif defined(__aarch64__)
+  // 0 is an invalid instruction opcode on arm64.
+  EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGILL) << status;
+#endif
 }
 
 // Typical ELF with a data + bss segment
diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc
index 3ecb8db8e..638a93979 100644
--- a/test/syscalls/linux/flock.cc
+++ b/test/syscalls/linux/flock.cc
@@ -21,6 +21,7 @@
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "test/syscalls/linux/file_base.h"
+#include "test/syscalls/linux/socket_test_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
@@ -34,11 +35,6 @@ namespace {
 
 class FlockTest : public FileTest {};
 
-TEST_F(FlockTest, BadFD) {
-  // EBADF: fd is not an open file descriptor.
-  ASSERT_THAT(flock(-1, 0), SyscallFailsWithErrno(EBADF));
-}
-
 TEST_F(FlockTest, InvalidOpCombinations) {
   // The operation cannot be both exclusive and shared.
   EXPECT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_SH | LOCK_NB),
@@ -57,15 +53,6 @@ TEST_F(FlockTest, NoOperationSpecified) {
               SyscallFailsWithErrno(EINVAL));
 }
 
-TEST(FlockTestNoFixture, FlockSupportsPipes) {
-  int fds[2];
-  ASSERT_THAT(pipe(fds), SyscallSucceeds());
-
-  EXPECT_THAT(flock(fds[0], LOCK_EX | LOCK_NB), SyscallSucceeds());
-  EXPECT_THAT(close(fds[0]), SyscallSucceeds());
-  EXPECT_THAT(close(fds[1]), SyscallSucceeds());
-}
-
 TEST_F(FlockTest, TestSimpleExLock) {
   // Test that we can obtain an exclusive lock (no other holders)
   // and that we can unlock it.
@@ -583,6 +570,66 @@ TEST_F(FlockTest, BlockingLockFirstExclusiveSecondExclusive_NoRandomSave) {
   EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds());
 }
 
+TEST(FlockTestNoFixture, BadFD) {
+  // EBADF: fd is not an open file descriptor.
+  ASSERT_THAT(flock(-1, 0), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(FlockTestNoFixture, FlockDir) {
+  auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0000));
+  EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds());
+}
+
+TEST(FlockTestNoFixture, FlockSymlink) {
+  // TODO(gvisor.dev/issue/2782): Replace with IsRunningWithVFS1() when O_PATH
+  // is supported.
+  SKIP_IF(IsRunningOnGvisor());
+
+  auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  auto symlink = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), file.path()));
+
+  auto fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(symlink.path(), O_RDONLY | O_PATH, 0000));
+  EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(FlockTestNoFixture, FlockProc) {
+  auto fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/status", O_RDONLY, 0000));
+  EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds());
+}
+
+TEST(FlockTestNoFixture, FlockPipe) {
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+
+  EXPECT_THAT(flock(fds[0], LOCK_EX | LOCK_NB), SyscallSucceeds());
+  // Check that the pipe was locked above.
+  EXPECT_THAT(flock(fds[1], LOCK_EX | LOCK_NB), SyscallFailsWithErrno(EAGAIN));
+
+  EXPECT_THAT(flock(fds[0], LOCK_UN), SyscallSucceeds());
+  EXPECT_THAT(flock(fds[1], LOCK_EX | LOCK_NB), SyscallSucceeds());
+
+  EXPECT_THAT(close(fds[0]), SyscallSucceeds());
+  EXPECT_THAT(close(fds[1]), SyscallSucceeds());
+}
+
+TEST(FlockTestNoFixture, FlockSocket) {
+  int sock = socket(AF_UNIX, SOCK_STREAM, 0);
+  ASSERT_THAT(sock, SyscallSucceeds());
+
+  struct sockaddr_un addr =
+      ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(true /* abstract */, AF_UNIX));
+  ASSERT_THAT(
+      bind(sock, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+      SyscallSucceeds());
+
+  EXPECT_THAT(flock(sock, LOCK_EX | LOCK_NB), SyscallSucceeds());
+  EXPECT_THAT(close(sock), SyscallSucceeds());
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 0e13ad190..1d1a7171d 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -19,6 +19,7 @@
 #include <sys/inotify.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <sys/xattr.h>
 
 #include <atomic>
 #include <list>
@@ -33,6 +34,7 @@
 #include "test/util/epoll_util.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
 #include "test/util/thread_util.h"
@@ -335,6 +337,11 @@ TEST(Inotify, InotifyFdNotWritable) {
   EXPECT_THAT(write(fd.get(), "x", 1), SyscallFailsWithErrno(EBADF));
 }
 
+TEST(Inotify, InitFlags) {
+  EXPECT_THAT(inotify_init1(IN_NONBLOCK | IN_CLOEXEC), SyscallSucceeds());
+  EXPECT_THAT(inotify_init1(12345), SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(Inotify, NonBlockingReadReturnsEagain) {
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
@@ -395,7 +402,7 @@ TEST(Inotify, CanDeleteFileAfterRemovingWatch) {
   file1.reset();
 }
 
-TEST(Inotify, CanRemoveWatchAfterDeletingFile) {
+TEST(Inotify, RemoveWatchAfterDeletingFileFails) {
   const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   TempPath file1 =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
@@ -491,17 +498,23 @@ TEST(Inotify, DeletingChildGeneratesEvents) {
                     Event(IN_DELETE, root_wd, Basename(file1_path))}));
 }
 
+// Creating a file in "parent/child" should generate events for child, but not
+// parent.
 TEST(Inotify, CreatingFileGeneratesEvents) {
-  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath child =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
 
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS));
   const int wd = ASSERT_NO_ERRNO_AND_VALUE(
-      InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+      InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS));
 
   // Create a new file in the directory.
   const TempPath file1 =
-      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(child.path()));
   const std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
 
@@ -554,6 +567,47 @@ TEST(Inotify, WritingFileGeneratesModifyEvent) {
   ASSERT_THAT(events, Are({Event(IN_MODIFY, wd, Basename(file1.path()))}));
 }
 
+TEST(Inotify, SizeZeroReadWriteGeneratesNothing) {
+  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const TempPath file1 =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+  const FileDescriptor file1_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
+  ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+  // Read from the empty file.
+  int val;
+  ASSERT_THAT(read(file1_fd.get(), &val, sizeof(val)),
+              SyscallSucceedsWithValue(0));
+
+  // Write zero bytes.
+  ASSERT_THAT(write(file1_fd.get(), "", 0), SyscallSucceedsWithValue(0));
+
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  ASSERT_THAT(events, Are({}));
+}
+
+TEST(Inotify, FailedFileCreationGeneratesNoEvents) {
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const std::string dir_path = dir.path();
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(fd.get(), dir_path, IN_ALL_EVENTS));
+
+  const char* p = dir_path.c_str();
+  ASSERT_THAT(mkdir(p, 0777), SyscallFails());
+  ASSERT_THAT(mknod(p, S_IFIFO, 0777), SyscallFails());
+  ASSERT_THAT(symlink(p, p), SyscallFails());
+  ASSERT_THAT(link(p, p), SyscallFails());
+  std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  ASSERT_THAT(events, Are({}));
+}
+
 TEST(Inotify, WatchSetAfterOpenReportsCloseFdEvent) {
   const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const FileDescriptor fd =
@@ -602,7 +656,7 @@ TEST(Inotify, ChildrenDeletionInWatchedDirGeneratesEvent) {
                    Event(IN_DELETE | IN_ISDIR, wd, Basename(dir1_path))}));
 }
 
-TEST(Inotify, WatchTargetDeletionGeneratesEvent) {
+TEST(Inotify, RmdirOnWatchedTargetGeneratesEvent) {
   const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
@@ -1228,7 +1282,7 @@ TEST(Inotify, LinkGeneratesAttribAndCreateEvents) {
       InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
 
   const int rc = link(file1.path().c_str(), link1.path().c_str());
-  // link(2) is only supported on tmpfs in the sandbox.
+  // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
   SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
           (errno == EPERM || errno == ENOENT));
   ASSERT_THAT(rc, SyscallSucceeds());
@@ -1322,21 +1376,27 @@ TEST(Inotify, HardlinksReuseSameWatch) {
                     Event(IN_DELETE, root_wd, Basename(file1_path))}));
 }
 
+// Calling mkdir within "parent/child" should generate an event for child, but
+// not parent.
 TEST(Inotify, MkdirGeneratesCreateEventWithDirFlag) {
-  const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath child =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
-  const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
-      InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+  ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS));
+  const int child_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS));
 
-  const TempPath dir1(NewTempAbsPathInDir(root.path()));
+  const TempPath dir1(NewTempAbsPathInDir(child.path()));
   ASSERT_THAT(mkdir(dir1.path().c_str(), 0777), SyscallSucceeds());
 
   const std::vector<Event> events =
       ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
   ASSERT_THAT(
       events,
-      Are({Event(IN_CREATE | IN_ISDIR, root_wd, Basename(dir1.path()))}));
+      Are({Event(IN_CREATE | IN_ISDIR, child_wd, Basename(dir1.path()))}));
 }
 
 TEST(Inotify, MultipleInotifyInstancesAndWatchesAllGetEvents) {
@@ -1596,7 +1656,44 @@ TEST(Inotify, EpollNoDeadlock) {
   }
 }
 
-TEST(Inotify, SpliceEvent) {
+// On Linux, inotify behavior is not very consistent with splice(2). We try our
+// best to emulate Linux for very basic calls to splice.
+TEST(Inotify, SpliceOnWatchTarget) {
+  int pipes[2];
+  ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+      dir.path(), "some content", TempPath::kDefaultFileMode));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+  const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), dir.path(), IN_ALL_EVENTS));
+  const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS));
+
+  EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr, 1, /*flags=*/0),
+              SyscallSucceedsWithValue(1));
+
+  // Surprisingly, events are not generated in Linux if we read from a file.
+  std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  ASSERT_THAT(events, Are({}));
+
+  EXPECT_THAT(splice(pipes[0], nullptr, fd.get(), nullptr, 1, /*flags=*/0),
+              SyscallSucceedsWithValue(1));
+
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  ASSERT_THAT(events, Are({
+                          Event(IN_MODIFY, dir_wd, Basename(file.path())),
+                          Event(IN_MODIFY, file_wd),
+                      }));
+}
+
+TEST(Inotify, SpliceOnInotifyFD) {
   int pipes[2];
   ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
 
@@ -1624,6 +1721,315 @@ TEST(Inotify, SpliceEvent) {
   ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)}));
 }
 
+// Watches on a parent should not be triggered by actions on a hard link to one
+// of its children that has a different parent.
+TEST(Inotify, LinkOnOtherParent) {
+  const TempPath dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+  std::string link_path = NewTempAbsPathInDir(dir2.path());
+
+  const int rc = link(file.path().c_str(), link_path.c_str());
+  // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+  SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+          (errno == EPERM || errno == ENOENT));
+  ASSERT_THAT(rc, SyscallSucceeds());
+
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), dir1.path(), IN_ALL_EVENTS));
+
+  // Perform various actions on the link outside of dir1, which should trigger
+  // no inotify events.
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(link_path.c_str(), O_RDWR));
+  int val = 0;
+  ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
+  ASSERT_THAT(unlink(link_path.c_str()), SyscallSucceeds());
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+}
+
+TEST(Inotify, Xattr) {
+  // TODO(gvisor.dev/issue/1636): Support extended attributes in runsc gofer.
+  SKIP_IF(IsRunningOnGvisor());
+
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const std::string path = file.path();
+  const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_RDWR));
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), path, IN_ALL_EVENTS));
+
+  const char* cpath = path.c_str();
+  const char* name = "user.test";
+  int val = 123;
+  ASSERT_THAT(setxattr(cpath, name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+  std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+
+  ASSERT_THAT(getxattr(cpath, name, &val, sizeof(val)), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+
+  char list[100];
+  ASSERT_THAT(listxattr(cpath, list, sizeof(list)), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+
+  ASSERT_THAT(removexattr(cpath, name), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+
+  ASSERT_THAT(fsetxattr(fd.get(), name, &val, sizeof(val), /*flags=*/0),
+              SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+
+  ASSERT_THAT(fgetxattr(fd.get(), name, &val, sizeof(val)), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+
+  ASSERT_THAT(flistxattr(fd.get(), list, sizeof(list)), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({}));
+
+  ASSERT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+}
+
+TEST(Inotify, Exec) {
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath bin = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateSymlinkTo(dir.path(), "/bin/true"));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), bin.path(), IN_ALL_EVENTS));
+
+  // Perform exec.
+  ScopedThread t([&bin]() {
+    ASSERT_THAT(execl(bin.path().c_str(), bin.path().c_str(), (char*)nullptr),
+                SyscallSucceeds());
+  });
+  t.Join();
+
+  std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_OPEN, wd), Event(IN_ACCESS, wd)}));
+}
+
+// Watches without IN_EXCL_UNLINK, should continue to emit events for file
+// descriptors after their corresponding files have been unlinked.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) {
+  const DisableSave ds;
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(
+      TempPath::CreateFileWith(dir.path(), "123", TempPath::kDefaultFileMode));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), dir.path(), IN_ALL_EVENTS));
+  const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS));
+
+  ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+  int val = 0;
+  ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({
+                          Event(IN_ATTRIB, file_wd),
+                          Event(IN_DELETE, dir_wd, Basename(file.path())),
+                          Event(IN_ACCESS, dir_wd, Basename(file.path())),
+                          Event(IN_ACCESS, file_wd),
+                          Event(IN_MODIFY, dir_wd, Basename(file.path())),
+                          Event(IN_MODIFY, file_wd),
+                      }));
+}
+
+// Watches created with IN_EXCL_UNLINK will stop emitting events on fds for
+// children that have already been unlinked.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlink_NoRandomSave) {
+  const DisableSave ds;
+  // TODO(gvisor.dev/issue/1624): This test fails on VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+      inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+  // Unlink the child, which should cause further operations on the open file
+  // descriptor to be ignored.
+  ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+  int val = 0;
+  ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_DELETE, wd, Basename(file.path()))}));
+}
+
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkDirectory_NoRandomSave) {
+  const DisableSave ds;
+
+  const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  TempPath dir =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
+  std::string dirPath = dir.path();
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(dirPath.c_str(), O_RDONLY | O_DIRECTORY));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+      inotify_fd.get(), parent.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+  // Unlink the dir, and then close the open fd.
+  ASSERT_THAT(rmdir(dirPath.c_str()), SyscallSucceeds());
+  dir.reset();
+
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  // No close event should appear.
+  ASSERT_THAT(events,
+              Are({Event(IN_DELETE | IN_ISDIR, wd, Basename(dirPath))}));
+}
+
+// If "dir/child" and "dir/child2" are links to the same file, and "dir/child"
+// is unlinked, a watch on "dir" with IN_EXCL_UNLINK will exclude future events
+// for fds on "dir/child" but not "dir/child2".
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkMultipleChildren_NoRandomSave) {
+  const DisableSave ds;
+  // TODO(gvisor.dev/issue/1624): This test fails on VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+  std::string path1 = file.path();
+  std::string path2 = NewTempAbsPathInDir(dir.path());
+
+  const int rc = link(path1.c_str(), path2.c_str());
+  // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+  SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+          (errno == EPERM || errno == ENOENT));
+  ASSERT_THAT(rc, SyscallSucceeds());
+  const FileDescriptor fd1 =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(path1.c_str(), O_RDWR));
+  const FileDescriptor fd2 =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(path2.c_str(), O_RDWR));
+
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+      inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+  // After unlinking path1, only events on the fd for path2 should be generated.
+  ASSERT_THAT(unlink(path1.c_str()), SyscallSucceeds());
+  ASSERT_THAT(write(fd1.get(), "x", 1), SyscallSucceeds());
+  ASSERT_THAT(write(fd2.get(), "x", 1), SyscallSucceeds());
+
+  const std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({
+                          Event(IN_DELETE, wd, Basename(path1)),
+                          Event(IN_MODIFY, wd, Basename(path2)),
+                      }));
+}
+
+// On native Linux, actions of data type FSNOTIFY_EVENT_INODE are not affected
+// by IN_EXCL_UNLINK (see
+// fs/notify/inotify/inotify_fsnotify.c:inotify_handle_event). Inode-level
+// events include changes to metadata and extended attributes.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) {
+  const DisableSave ds;
+
+  const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+  const TempPath file =
+      ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+  const FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR));
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+      inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+  // NOTE(b/157163751): Create another link before unlinking. This is needed for
+  // the gofer filesystem in gVisor, where open fds will not work once the link
+  // count hits zero. In VFS2, we end up skipping the gofer test anyway, because
+  // hard links are not supported for gofer fs.
+  if (IsRunningOnGvisor()) {
+    std::string link_path = NewTempAbsPath();
+    const int rc = link(file.path().c_str(), link_path.c_str());
+    // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+    SKIP_IF(rc != 0 && (errno == EPERM || errno == ENOENT));
+    ASSERT_THAT(rc, SyscallSucceeds());
+  }
+
+  // Even after unlinking, inode-level operations will trigger events regardless
+  // of IN_EXCL_UNLINK.
+  ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+
+  // Perform various actions on fd.
+  ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
+  std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({
+                          Event(IN_DELETE, wd, Basename(file.path())),
+                          Event(IN_MODIFY, wd, Basename(file.path())),
+                      }));
+
+  struct timeval times[2] = {{1, 0}, {2, 0}};
+  ASSERT_THAT(futimes(fd.get(), times), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))}));
+
+  // S/R is disabled on this entire test due to behavior with unlink; it must
+  // also be disabled after this point because of fchmod.
+  ASSERT_THAT(fchmod(fd.get(), 0777), SyscallSucceeds());
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))}));
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
index dd981a278..e397d5f57 100644
--- a/test/syscalls/linux/itimer.cc
+++ b/test/syscalls/linux/itimer.cc
@@ -267,8 +267,19 @@ int TestSIGPROFFairness(absl::Duration sleep) {
 // Random save/restore is disabled as it introduces additional latency and
 // unpredictable distribution patterns.
 TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) {
-  // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform.
-  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+  // On the KVM and ptrace platforms, switches between sentry and application
+  // context are sometimes extremely slow, causing the itimer to send SIGPROF to
+  // a thread that either already has one pending or has had SIGPROF delivered,
+  // but hasn't handled it yet (and thus therefore still has SIGPROF masked). In
+  // either case, since itimer signals are group-directed, signal sending falls
+  // back to notifying the thread group leader. ItimerSignalTest() fails if "too
+  // many" signals are delivered to the thread group leader, so these tests are
+  // flaky on these platforms.
+  //
+  // TODO(b/143247272): Clarify why context switches are so slow on KVM.
+  const auto gvisor_platform = GvisorPlatform();
+  SKIP_IF(gvisor_platform == Platform::kKVM ||
+          gvisor_platform == Platform::kPtrace);
 
   pid_t child;
   int execve_errno;
@@ -291,8 +302,10 @@ TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) {
 // Random save/restore is disabled as it introduces additional latency and
 // unpredictable distribution patterns.
 TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyIdle_NoRandomSave) {
-  // TODO(b/143247272): CPU time accounting is inaccurate for the KVM platform.
-  SKIP_IF(GvisorPlatform() == Platform::kKVM);
+  // See comment in DeliversSIGPROFToThreadsRoughlyFairlyActive.
+  const auto gvisor_platform = GvisorPlatform();
+  SKIP_IF(gvisor_platform == Platform::kKVM ||
+          gvisor_platform == Platform::kPtrace);
 
   pid_t child;
   int execve_errno;
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
index 640fe6bfc..670c0284b 100644
--- a/test/syscalls/linux/open.cc
+++ b/test/syscalls/linux/open.cc
@@ -416,6 +416,29 @@ TEST_F(OpenTest, CanTruncateWriteOnlyNoReadPermission_NoRandomSave) {
   EXPECT_EQ(stat.st_size, 0);
 }
 
+TEST_F(OpenTest, CanTruncateWithStrangePermissions) {
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+  ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+  const DisableSave ds;  // Permissions are dropped.
+  std::string path = NewTempAbsPath();
+  int fd;
+  // Create a file without user permissions.
+  EXPECT_THAT(  // SAVE_BELOW
+      fd = open(path.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 055),
+      SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+
+  // Cannot open file because we are owner and have no permissions set.
+  EXPECT_THAT(open(path.c_str(), O_RDONLY), SyscallFailsWithErrno(EACCES));
+
+  // We *can* chmod the file, because we are the owner.
+  EXPECT_THAT(chmod(path.c_str(), 0755), SyscallSucceeds());
+
+  // Now we can open the file again.
+  EXPECT_THAT(fd = open(path.c_str(), O_RDWR), SyscallSucceeds());
+  EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/ping_socket.cc b/test/syscalls/linux/ping_socket.cc
new file mode 100644
index 000000000..a9bfdb37b
--- /dev/null
+++ b/test/syscalls/linux/ping_socket.cc
@@ -0,0 +1,91 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/save_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+class PingSocket : public ::testing::Test {
+ protected:
+  // Creates a socket to be used in tests.
+  void SetUp() override;
+
+  // Closes the socket created by SetUp().
+  void TearDown() override;
+
+  // The loopback address.
+  struct sockaddr_in addr_;
+};
+
+void PingSocket::SetUp() {
+  // On some hosts ping sockets are restricted to specific groups using the
+  // sysctl "ping_group_range".
+  int s = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
+  if (s < 0 && errno == EPERM) {
+    GTEST_SKIP();
+  }
+  close(s);
+
+  addr_ = {};
+  // Just a random port as the destination port number is irrelevant for ping
+  // sockets.
+  addr_.sin_port = 12345;
+  addr_.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  addr_.sin_family = AF_INET;
+}
+
+void PingSocket::TearDown() {}
+
+// Test ICMP port exhaustion returns EAGAIN.
+//
+// We disable both random/cooperative S/R for this test as it makes way too many
+// syscalls.
+TEST_F(PingSocket, ICMPPortExhaustion_NoRandomSave) {
+  DisableSave ds;
+  std::vector<FileDescriptor> sockets;
+  constexpr int kSockets = 65536;
+  addr_.sin_port = 0;
+  for (int i = 0; i < kSockets; i++) {
+    auto s =
+        ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP));
+    int ret = connect(s.get(), reinterpret_cast<struct sockaddr*>(&addr_),
+                      sizeof(addr_));
+    if (ret == 0) {
+      sockets.push_back(std::move(s));
+      continue;
+    }
+    ASSERT_THAT(ret, SyscallFailsWithErrno(EAGAIN));
+    break;
+  }
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc
index 1e35a4a8b..7a316427d 100644
--- a/test/syscalls/linux/poll.cc
+++ b/test/syscalls/linux/poll.cc
@@ -259,9 +259,9 @@ TEST_F(PollTest, Nfds) {
   TEST_PCHECK(getrlimit(RLIMIT_NOFILE, &rlim) == 0);
 
   // gVisor caps the number of FDs that epoll can use beyond RLIMIT_NOFILE.
-  constexpr rlim_t gVisorMax = 1048576;
-  if (rlim.rlim_cur > gVisorMax) {
-    rlim.rlim_cur = gVisorMax;
+  constexpr rlim_t maxFD = 4096;
+  if (rlim.rlim_cur > maxFD) {
+    rlim.rlim_cur = maxFD;
     TEST_PCHECK(setrlimit(RLIMIT_NOFILE, &rlim) == 0);
   }
 
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
index b8a0159ba..aabfa6955 100644
--- a/test/syscalls/linux/pty.cc
+++ b/test/syscalls/linux/pty.cc
@@ -364,6 +364,12 @@ PosixErrorOr<size_t> PollAndReadFd(int fd, void* buf, size_t count,
     ssize_t n =
         ReadFd(fd, static_cast<char*>(buf) + completed, count - completed);
     if (n < 0) {
+      if (errno == EAGAIN) {
+        // Linux sometimes returns EAGAIN from this read, despite the fact that
+        // poll returned success. Let's just do what do as we are told and try
+        // again.
+        continue;
+      }
       return PosixError(errno, "read failed");
     }
     completed += n;
diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc
index afa59c1da..e0a4d0985 100644
--- a/test/syscalls/linux/socket.cc
+++ b/test/syscalls/linux/socket.cc
@@ -62,9 +62,7 @@ TEST(SocketTest, ProtocolInet) {
 }
 
 TEST(SocketTest, UnixSocketStat) {
-  // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It
-  // should pass in VFS2.
-  SKIP_IF(IsRunningOnGvisor());
+  SKIP_IF(IsRunningWithVFS1());
 
   FileDescriptor bound =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
@@ -94,9 +92,7 @@ TEST(SocketTest, UnixSocketStat) {
 }
 
 TEST(SocketTest, UnixConnectNeedsWritePerm) {
-  // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It
-  // should succeed in VFS2.
-  SKIP_IF(IsRunningOnGvisor());
+  SKIP_IF(IsRunningWithVFS1());
 
   FileDescriptor bound =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
@@ -128,10 +124,7 @@ using SocketOpenTest = ::testing::TestWithParam<int>;
 // UDS cannot be opened.
 TEST_P(SocketOpenTest, Unix) {
   // FIXME(b/142001530): Open incorrectly succeeds on gVisor.
-  //
-  // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It
-  // should succeed in VFS2.
-  SKIP_IF(IsRunningOnGvisor());
+  SKIP_IF(IsRunningWithVFS1());
 
   FileDescriptor bound =
       ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index 1967329ee..4b2cb0606 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -363,8 +363,9 @@ TEST_P(BindToDeviceSequenceTest, BindTwiceWithReuseOnce) {
 }
 
 TEST_P(BindToDeviceSequenceTest, BindWithReuseAddr) {
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
+  // FIXME(b/158253835): Support SO_REUSEADDR on TCP sockets.
+  SKIP_IF(IsRunningOnGvisor() &&
+          (GetParam().type & SOCK_STREAM) == SOCK_STREAM);
 
   ASSERT_NO_FATAL_FAILURE(
       BindSocket(/* reusePort */ false, /* reuse_addr */ true));
@@ -405,8 +406,9 @@ TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReusePort) {
 }
 
 TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReuseAddr) {
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
+  // FIXME(b/158253835): Support SO_REUSEADDR on TCP sockets.
+  SKIP_IF(IsRunningOnGvisor() &&
+          (GetParam().type & SOCK_STREAM) == SOCK_STREAM);
 
   ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
                                      /* reuse_addr */ true,
@@ -434,8 +436,9 @@ TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReusePort) {
 }
 
 TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
+  // FIXME(b/158253835): Support SO_REUSEADDR on TCP sockets.
+  SKIP_IF(IsRunningOnGvisor() &&
+          (GetParam().type & SOCK_STREAM) == SOCK_STREAM);
 
   ASSERT_NO_FATAL_FAILURE(BindSocket(
       /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
@@ -469,10 +472,20 @@ TEST_P(BindToDeviceSequenceTest, BindReuseAddrThenReuseAddr) {
                                      /* bind_to_device */ 0, EADDRINUSE));
 }
 
-// This behavior seems like a bug?
 TEST_P(BindToDeviceSequenceTest,
        BindReuseAddrThenReuseAddrReusePortThenReuseAddr) {
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+  // The behavior described in this test seems like a Linux bug. It doesn't
+  // make any sense and it is unlikely that any applications rely on it.
+  //
+  // Both SO_REUSEADDR and SO_REUSEPORT allow binding multiple UDP sockets to
+  // the same address and deliver each packet to exactly one of the bound
+  // sockets. If both are enabled, one of the strategies is selected to route
+  // packets. The strategy is selected dynamically based on the settings of the
+  // currently bound sockets. Usually, the strategy is selected based on the
+  // common setting (SO_REUSEADDR or SO_REUSEPORT) amongst the sockets, but for
+  // some reason, Linux allows binding sets of sockets with no overlapping
+  // settings in some situations. In this case, it is not obvious which strategy
+  // would be selected as the configured setting is a contradiction.
   SKIP_IF(IsRunningOnGvisor());
 
   ASSERT_NO_FATAL_FAILURE(BindSocket(
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index fa890ec98..a914d9a12 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -1900,9 +1900,6 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) {
 TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReservedReuseAddr) {
   auto const& param = GetParam();
 
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_DGRAM);
-
   // Bind the v6 loopback on a dual stack socket.
   TestAddress const& test_addr = V6Loopback();
   sockaddr_storage bound_addr = test_addr.addr;
@@ -2091,9 +2088,6 @@ TEST_P(SocketMultiProtocolInetLoopbackTest,
        V4MappedEphemeralPortReservedResueAddr) {
   auto const& param = GetParam();
 
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_DGRAM);
-
   // Bind the v4 loopback on a dual stack socket.
   TestAddress const& test_addr = V4MappedLoopback();
   sockaddr_storage bound_addr = test_addr.addr;
@@ -2283,9 +2277,6 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
 TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
   auto const& param = GetParam();
 
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor() && param.type == SOCK_DGRAM);
-
   // Bind the v4 loopback on a v4 socket.
   TestAddress const& test_addr = V4Loopback();
   sockaddr_storage bound_addr = test_addr.addr;
diff --git a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
new file mode 100644
index 000000000..2324c7f6a
--- /dev/null
+++ b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
@@ -0,0 +1,171 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <string.h>
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/save_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::Gt;
+
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
+  switch (family) {
+    case AF_INET:
+      return static_cast<uint16_t>(
+          reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
+    case AF_INET6:
+      return static_cast<uint16_t>(
+          reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
+    default:
+      return PosixError(EINVAL,
+                        absl::StrCat("unknown socket family: ", family));
+  }
+}
+
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
+  switch (family) {
+    case AF_INET:
+      reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
+      return NoError();
+    case AF_INET6:
+      reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port;
+      return NoError();
+    default:
+      return PosixError(EINVAL,
+                        absl::StrCat("unknown socket family: ", family));
+  }
+}
+
+struct TestParam {
+  TestAddress listener;
+  TestAddress connector;
+};
+
+std::string DescribeTestParam(::testing::TestParamInfo<TestParam> const& info) {
+  return absl::StrCat("Listen", info.param.listener.description, "_Connect",
+                      info.param.connector.description);
+}
+
+using SocketInetLoopbackTest = ::testing::TestWithParam<TestParam>;
+
+// This test verifies that connect returns EADDRNOTAVAIL if all local ephemeral
+// ports are already in use for a given destination ip/port.
+// We disable S/R because this test creates a large number of sockets.
+TEST_P(SocketInetLoopbackTest, TestTCPPortExhaustion_NoRandomSave) {
+  auto const& param = GetParam();
+  TestAddress const& listener = param.listener;
+  TestAddress const& connector = param.connector;
+
+  constexpr int kBacklog = 10;
+  constexpr int kClients = 65536;
+
+  // Create the listening socket.
+  auto listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+      Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+  sockaddr_storage listen_addr = listener.addr;
+  ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+                   listener.addr_len),
+              SyscallSucceeds());
+  ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+  // Get the port bound by the listening socket.
+  socklen_t addrlen = listener.addr_len;
+  ASSERT_THAT(getsockname(listen_fd.get(),
+                          reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+              SyscallSucceeds());
+  uint16_t const port =
+      ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+  // Disable cooperative S/R as we are making too many syscalls.
+  DisableSave ds;
+
+  // Now we keep opening connections till we run out of local ephemeral ports.
+  // and assert the error we get back.
+  sockaddr_storage conn_addr = connector.addr;
+  ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+  std::vector<FileDescriptor> clients;
+  std::vector<FileDescriptor> servers;
+
+  for (int i = 0; i < kClients; i++) {
+    FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
+        Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+    int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+                      connector.addr_len);
+    if (ret == 0) {
+      clients.push_back(std::move(client));
+      FileDescriptor server =
+          ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+      servers.push_back(std::move(server));
+      continue;
+    }
+    ASSERT_THAT(ret, SyscallFailsWithErrno(EADDRNOTAVAIL));
+    break;
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    All, SocketInetLoopbackTest,
+    ::testing::Values(
+        // Listeners bound to IPv4 addresses refuse connections using IPv6
+        // addresses.
+        TestParam{V4Any(), V4Any()}, TestParam{V4Any(), V4Loopback()},
+        TestParam{V4Any(), V4MappedAny()},
+        TestParam{V4Any(), V4MappedLoopback()},
+        TestParam{V4Loopback(), V4Any()}, TestParam{V4Loopback(), V4Loopback()},
+        TestParam{V4Loopback(), V4MappedLoopback()},
+        TestParam{V4MappedAny(), V4Any()},
+        TestParam{V4MappedAny(), V4Loopback()},
+        TestParam{V4MappedAny(), V4MappedAny()},
+        TestParam{V4MappedAny(), V4MappedLoopback()},
+        TestParam{V4MappedLoopback(), V4Any()},
+        TestParam{V4MappedLoopback(), V4Loopback()},
+        TestParam{V4MappedLoopback(), V4MappedLoopback()},
+
+        // Listeners bound to IN6ADDR_ANY accept all connections.
+        TestParam{V6Any(), V4Any()}, TestParam{V6Any(), V4Loopback()},
+        TestParam{V6Any(), V4MappedAny()},
+        TestParam{V6Any(), V4MappedLoopback()}, TestParam{V6Any(), V6Any()},
+        TestParam{V6Any(), V6Loopback()},
+
+        // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4
+        // addresses.
+        TestParam{V6Loopback(), V6Any()},
+        TestParam{V6Loopback(), V6Loopback()}),
+    DescribeTestParam);
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index fa81845fd..15adc8d0e 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -524,6 +524,7 @@ TEST_P(TCPSocketPairTest, SetTCPKeepintvlZero) {
 // Copied from include/net/tcp.h.
 constexpr int MAX_TCP_KEEPIDLE = 32767;
 constexpr int MAX_TCP_KEEPINTVL = 32767;
+constexpr int MAX_TCP_KEEPCNT = 127;
 
 TEST_P(TCPSocketPairTest, SetTCPKeepidleAboveMax) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
@@ -575,6 +576,78 @@ TEST_P(TCPSocketPairTest, SetTCPKeepintvlToMax) {
   EXPECT_EQ(get, MAX_TCP_KEEPINTVL);
 }
 
+TEST_P(TCPSocketPairTest, TCPKeepcountDefault) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, 9);  // 9 keepalive probes.
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountZero) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kZero = 0;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT, &kZero,
+                         sizeof(kZero)),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountAboveMax) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  constexpr int kAboveMax = MAX_TCP_KEEPCNT + 1;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT,
+                         &kAboveMax, sizeof(kAboveMax)),
+              SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountToMax) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT,
+                         &MAX_TCP_KEEPCNT, sizeof(MAX_TCP_KEEPCNT)),
+              SyscallSucceedsWithValue(0));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, MAX_TCP_KEEPCNT);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountToOne) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int keepaliveCount = 1;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT,
+                         &keepaliveCount, sizeof(keepaliveCount)),
+              SyscallSucceedsWithValue(0));
+
+  int get = -1;
+  socklen_t get_len = sizeof(get);
+  EXPECT_THAT(
+      getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT, &get, &get_len),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get_len, sizeof(get));
+  EXPECT_EQ(get, keepaliveCount);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountToNegative) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  int keepaliveCount = -5;
+  EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT,
+                         &keepaliveCount, sizeof(keepaliveCount)),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 TEST_P(TCPSocketPairTest, SetOOBInline) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 1c533fdf2..edb86aded 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -225,9 +225,6 @@ TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
 TEST_P(UDPSocketPairTest, SetReuseAddr) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
                          &kSockOptOn, sizeof(kSockOptOn)),
               SyscallSucceeds());
@@ -292,9 +289,6 @@ TEST_P(UDPSocketPairTest, SetReusePort) {
 TEST_P(UDPSocketPairTest, SetReuseAddrReusePort) {
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
                          &kSockOptOn, sizeof(kSockOptOn)),
               SyscallSucceeds());
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index ca597e267..5536ab53a 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -157,7 +157,7 @@ TEST_P(IPUnboundSocketTest, TOSDefault) {
   int get = -1;
   socklen_t get_sz = sizeof(get);
   constexpr int kDefaultTOS = 0;
-  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+  ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_sz, sizeof(get));
   EXPECT_EQ(get, kDefaultTOS);
@@ -173,7 +173,7 @@ TEST_P(IPUnboundSocketTest, SetTOS) {
 
   int get = -1;
   socklen_t get_sz = sizeof(get);
-  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+  ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_sz, sizeof(get));
   EXPECT_EQ(get, set);
@@ -188,7 +188,7 @@ TEST_P(IPUnboundSocketTest, ZeroTOS) {
               SyscallSucceedsWithValue(0));
   int get = -1;
   socklen_t get_sz = sizeof(get);
-  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+  ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_sz, sizeof(get));
   EXPECT_EQ(get, set);
@@ -210,7 +210,7 @@ TEST_P(IPUnboundSocketTest, InvalidLargeTOS) {
   }
   int get = -1;
   socklen_t get_sz = sizeof(get);
-  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+  ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_sz, sizeof(get));
   EXPECT_EQ(get, kDefaultTOS);
@@ -229,7 +229,7 @@ TEST_P(IPUnboundSocketTest, CheckSkipECN) {
   }
   int get = -1;
   socklen_t get_sz = sizeof(get);
-  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+  ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_sz, sizeof(get));
   EXPECT_EQ(get, expect);
@@ -249,7 +249,7 @@ TEST_P(IPUnboundSocketTest, ZeroTOSOptionSize) {
   }
   int get = -1;
   socklen_t get_sz = 0;
-  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+  ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_sz, 0);
   EXPECT_EQ(get, -1);
@@ -276,7 +276,7 @@ TEST_P(IPUnboundSocketTest, SmallTOSOptionSize) {
     }
     uint get = -1;
     socklen_t get_sz = i;
-    EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+    ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
                 SyscallSucceedsWithValue(0));
     EXPECT_EQ(get_sz, expect_sz);
     // Account for partial copies by getsockopt, retrieve the lower
@@ -297,7 +297,7 @@ TEST_P(IPUnboundSocketTest, LargeTOSOptionSize) {
     // We expect the system call handler to only copy atmost sizeof(int) bytes
     // as asserted by the check below. Hence, we do not expect the copy to
     // overflow in getsockopt.
-    EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+    ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
                 SyscallSucceedsWithValue(0));
     EXPECT_EQ(get_sz, sizeof(int));
     EXPECT_EQ(get, set);
@@ -325,7 +325,7 @@ TEST_P(IPUnboundSocketTest, NegativeTOS) {
   }
   int get = -1;
   socklen_t get_sz = sizeof(get);
-  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+  ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_sz, sizeof(get));
   EXPECT_EQ(get, expect);
@@ -338,20 +338,20 @@ TEST_P(IPUnboundSocketTest, InvalidNegativeTOS) {
   TOSOption t = GetTOSOption(GetParam().domain);
   int expect;
   if (GetParam().domain == AF_INET) {
-    EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+    ASSERT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
                 SyscallSucceedsWithValue(0));
     expect = static_cast<uint8_t>(set);
     if (GetParam().protocol == IPPROTO_TCP) {
       expect &= ~INET_ECN_MASK;
     }
   } else {
-    EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+    ASSERT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
                 SyscallFailsWithErrno(EINVAL));
     expect = 0;
   }
   int get = 0;
   socklen_t get_sz = sizeof(get);
-  EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+  ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
               SyscallSucceedsWithValue(0));
   EXPECT_EQ(get_sz, sizeof(get));
   EXPECT_EQ(get, expect);
@@ -377,8 +377,11 @@ TEST_P(IPUnboundSocketTest, NullTOS) {
       //
       // Linux's implementation would need fixing as passing a nullptr as optval
       // and non-zero optlen may not be valid.
-      EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, nullptr, set_sz),
-                  SyscallSucceedsWithValue(0));
+      // TODO(b/158666797): Combine the gVisor and linux cases for IPv6.
+      // Some kernel versions return EFAULT, so we handle both.
+      EXPECT_THAT(
+          setsockopt(socket->get(), t.level, t.option, nullptr, set_sz),
+          AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(0)));
     }
   }
   socklen_t get_sz = sizeof(int);
@@ -419,6 +422,38 @@ TEST_P(IPUnboundSocketTest, InsufficientBufferTOS) {
   EXPECT_THAT(sendmsg(socket->get(), &msg, 0), SyscallFailsWithErrno(EINVAL));
 }
 
+TEST_P(IPUnboundSocketTest, ReuseAddrDefault) {
+  auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  // FIXME(b/76031995): gVisor TCP sockets default to SO_REUSEADDR enabled.
+  SKIP_IF(IsRunningOnGvisor() &&
+          (GetParam().type & SOCK_STREAM) == SOCK_STREAM);
+
+  int get = -1;
+  socklen_t get_sz = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(socket->get(), SOL_SOCKET, SO_REUSEADDR, &get, &get_sz),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOff);
+  EXPECT_EQ(get_sz, sizeof(get));
+}
+
+TEST_P(IPUnboundSocketTest, SetReuseAddr) {
+  auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+  ASSERT_THAT(setsockopt(socket->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+                         sizeof(kSockOptOn)),
+              SyscallSucceedsWithValue(0));
+
+  int get = -1;
+  socklen_t get_sz = sizeof(get);
+  ASSERT_THAT(
+      getsockopt(socket->get(), SOL_SOCKET, SO_REUSEADDR, &get, &get_sz),
+      SyscallSucceedsWithValue(0));
+  EXPECT_EQ(get, kSockOptOn);
+  EXPECT_EQ(get_sz, sizeof(get));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     IPUnboundSockets, IPUnboundSocketTest,
     ::testing::ValuesIn(VecCat<SocketKind>(VecCat<SocketKind>(
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index bc4b07a62..4db9553e1 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -1672,10 +1672,10 @@ TEST_P(IPv4UDPUnboundSocketTest, TestBindToBcastThenSend) {
 }
 
 // Check that SO_REUSEADDR always delivers to the most recently bound socket.
-TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrDistribution) {
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
+//
+// FIXME(gvisor.dev/issue/873): Endpoint order is not restored correctly. Enable
+// random and co-op save (below) once that is fixed.
+TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrDistribution_NoRandomSave) {
   std::vector<std::unique_ptr<FileDescriptor>> sockets;
   sockets.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()));
 
@@ -1696,6 +1696,9 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrDistribution) {
 
   constexpr int kMessageSize = 200;
 
+  // FIXME(gvisor.dev/issue/873): Endpoint order is not restored correctly.
+  const DisableSave ds;
+
   for (int i = 0; i < 10; i++) {
     // Add a new receiver.
     sockets.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()));
@@ -1836,9 +1839,6 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReusePort) {
 }
 
 TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReuseAddr) {
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
@@ -1880,9 +1880,6 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReuseAddr) {
 }
 
 TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConversionReversable1) {
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
@@ -1927,9 +1924,6 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConversionReversable1) {
 }
 
 TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConversionReversable2) {
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
@@ -2020,9 +2014,6 @@ TEST_P(IPv4UDPUnboundSocketTest, BindDoubleReuseAddrReusePortThenReusePort) {
 }
 
 TEST_P(IPv4UDPUnboundSocketTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
-  // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
-  SKIP_IF(IsRunningOnGvisor());
-
   auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
   auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
@@ -2129,6 +2120,39 @@ TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
               SyscallSucceedsWithValue(kMessageSize));
 }
 
+// Check that connect returns EADDRNOTAVAIL when out of local ephemeral ports.
+// We disable S/R because this test creates a large number of sockets.
+TEST_P(IPv4UDPUnboundSocketTest, UDPConnectPortExhaustion_NoRandomSave) {
+  auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+  constexpr int kClients = 65536;
+  // Bind the first socket to the loopback and take note of the selected port.
+  auto addr = V4Loopback();
+  ASSERT_THAT(bind(receiver1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                   addr.addr_len),
+              SyscallSucceeds());
+  socklen_t addr_len = addr.addr_len;
+  ASSERT_THAT(getsockname(receiver1->get(),
+                          reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+              SyscallSucceeds());
+  EXPECT_EQ(addr_len, addr.addr_len);
+
+  // Disable cooperative S/R as we are making too many syscalls.
+  DisableSave ds;
+  std::vector<std::unique_ptr<FileDescriptor>> sockets;
+  for (int i = 0; i < kClients; i++) {
+    auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+    int ret = connect(s->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+                      addr.addr_len);
+    if (ret == 0) {
+      sockets.push_back(std::move(s));
+      continue;
+    }
+    ASSERT_THAT(ret, SyscallFailsWithErrno(EAGAIN));
+    break;
+  }
+}
+
 // Test that socket will receive packet info control message.
 TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
   // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 8bf663e8b..591cab3fd 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -256,10 +256,9 @@ TEST_P(UnixSocketPairTest, ShutdownWrite) {
 }
 
 TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) {
-  // TODO(b/122310852): We should be returning ENXIO and NOT EIO.
-  // TODO(github.dev/issue/1624): This should be resolved in VFS2. Verify
-  // that this is the case and delete the SKIP_IF once we delete VFS1.
-  SKIP_IF(IsRunningOnGvisor());
+  // TODO(gvisor.dev/issue/1624): In VFS1, we return EIO instead of ENXIO (see
+  // b/122310852). Remove this skip once VFS1 is deleted.
+  SKIP_IF(IsRunningWithVFS1());
   auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
 
   // Opening a socket pair via /proc/self/fd/X is a ENXIO.
diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc
index 84d3a569e..6d03df4d9 100644
--- a/test/syscalls/linux/socket_unix_seqpacket.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket.cc
@@ -43,6 +43,24 @@ TEST_P(SeqpacketUnixSocketPairTest, ReadOneSideClosed) {
               SyscallSucceedsWithValue(0));
 }
 
+TEST_P(SeqpacketUnixSocketPairTest, Sendto) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  struct sockaddr_un addr = {};
+  addr.sun_family = AF_UNIX;
+  constexpr char kPath[] = "\0nonexistent";
+  memcpy(addr.sun_path, kPath, sizeof(kPath));
+
+  constexpr char kStr[] = "abc";
+  ASSERT_THAT(sendto(sockets->second_fd(), kStr, 3, 0, (struct sockaddr*)&addr,
+                     sizeof(addr)),
+              SyscallSucceedsWithValue(3));
+
+  char data[10] = {};
+  ASSERT_THAT(read(sockets->first_fd(), data, sizeof(data)),
+              SyscallSucceedsWithValue(3));
+}
+
 }  // namespace
 
 }  // namespace testing
diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc
index 563467365..99e77b89e 100644
--- a/test/syscalls/linux/socket_unix_stream.cc
+++ b/test/syscalls/linux/socket_unix_stream.cc
@@ -89,6 +89,20 @@ TEST_P(StreamUnixSocketPairTest, ReadOneSideClosedWithUnreadData) {
               SyscallFailsWithErrno(ECONNRESET));
 }
 
+TEST_P(StreamUnixSocketPairTest, Sendto) {
+  auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+  struct sockaddr_un addr = {};
+  addr.sun_family = AF_UNIX;
+  constexpr char kPath[] = "\0nonexistent";
+  memcpy(addr.sun_path, kPath, sizeof(kPath));
+
+  constexpr char kStr[] = "abc";
+  ASSERT_THAT(sendto(sockets->second_fd(), kStr, 3, 0, (struct sockaddr*)&addr,
+                     sizeof(addr)),
+              SyscallFailsWithErrno(EISCONN));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     AllUnixDomainSockets, StreamUnixSocketPairTest,
     ::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>(
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
index f103e2e56..08fc4b1b7 100644
--- a/test/syscalls/linux/splice.cc
+++ b/test/syscalls/linux/splice.cc
@@ -430,6 +430,55 @@ TEST(SpliceTest, TwoPipes) {
   EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0);
 }
 
+TEST(SpliceTest, TwoPipesCircular) {
+  // This test deadlocks the sentry on VFS1 because VFS1 splice ordering is
+  // based on fs.File.UniqueID, which does not prevent circular ordering between
+  // e.g. inode-level locks taken by fs.FileOperations.
+  SKIP_IF(IsRunningWithVFS1());
+
+  // Create two pipes.
+  int fds[2];
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor first_rfd(fds[0]);
+  const FileDescriptor first_wfd(fds[1]);
+  ASSERT_THAT(pipe(fds), SyscallSucceeds());
+  const FileDescriptor second_rfd(fds[0]);
+  const FileDescriptor second_wfd(fds[1]);
+
+  // On Linux, each pipe is normally limited to
+  // include/linux/pipe_fs_i.h:PIPE_DEF_BUFFERS buffers worth of data.
+  constexpr size_t PIPE_DEF_BUFFERS = 16;
+
+  // Write some data to each pipe. Below we splice 1 byte at a time between
+  // pipes, which very quickly causes each byte to be stored in a separate
+  // buffer, so we must ensure that the total amount of data in the system is <=
+  // PIPE_DEF_BUFFERS bytes.
+  std::vector<char> buf(PIPE_DEF_BUFFERS / 2);
+  RandomizeBuffer(buf.data(), buf.size());
+  ASSERT_THAT(write(first_wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+  ASSERT_THAT(write(second_wfd.get(), buf.data(), buf.size()),
+              SyscallSucceedsWithValue(buf.size()));
+
+  // Have another thread splice from the second pipe to the first, while we
+  // splice from the first to the second. The test passes if this does not
+  // deadlock.
+  const int kIterations = 1000;
+  DisableSave ds;
+  ScopedThread t([&]() {
+    for (int i = 0; i < kIterations; i++) {
+      ASSERT_THAT(
+          splice(second_rfd.get(), nullptr, first_wfd.get(), nullptr, 1, 0),
+          SyscallSucceedsWithValue(1));
+    }
+  });
+  for (int i = 0; i < kIterations; i++) {
+    ASSERT_THAT(
+        splice(first_rfd.get(), nullptr, second_wfd.get(), nullptr, 1, 0),
+        SyscallSucceedsWithValue(1));
+  }
+}
+
 TEST(SpliceTest, Blocking) {
   // Create two new pipes.
   int first[2], second[2];
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
index 6195b11e1..97d554e72 100644
--- a/test/syscalls/linux/tuntap.cc
+++ b/test/syscalls/linux/tuntap.cc
@@ -398,5 +398,25 @@ TEST_F(TuntapTest, SendUdpTriggersArpResolution) {
   }
 }
 
+// Write hang bug found by syskaller: b/155928773
+// https://syzkaller.appspot.com/bug?id=065b893bd8d1d04a4e0a1d53c578537cde1efe99
+TEST_F(TuntapTest, WriteHangBug155928773) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+  FileDescriptor fd =
+      ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
+
+  int sock = socket(AF_INET, SOCK_DGRAM, 0);
+  ASSERT_THAT(sock, SyscallSucceeds());
+
+  struct sockaddr_in remote = {};
+  remote.sin_family = AF_INET;
+  remote.sin_port = htons(42);
+  inet_pton(AF_INET, "10.0.0.1", &remote.sin_addr);
+  // Return values do not matter in this test.
+  connect(sock, reinterpret_cast<struct sockaddr*>(&remote), sizeof(remote));
+  write(sock, "hello", 5);
+}
+
 }  // namespace testing
 }  // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
index 740c7986d..42521efef 100644
--- a/test/syscalls/linux/udp_socket_test_cases.cc
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -17,6 +17,7 @@
 #include <arpa/inet.h>
 #include <fcntl.h>
 #include <netinet/in.h>
+#include <poll.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/types.h>
@@ -673,6 +674,11 @@ TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
   char buf[3];
   // Send zero length packet from s_ to t_.
   ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+
+  struct pollfd pfd = {t_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // Receive the packet.
   char received[3];
   EXPECT_THAT(read(t_, received, sizeof(received)),
@@ -698,6 +704,11 @@ TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
   char buf[3];
   // Send zero length packet from s_ to t_.
   ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+
+  struct pollfd pfd = {t_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // Receive the packet.
   char received[3];
   EXPECT_THAT(read(t_, received, sizeof(received)),
@@ -859,6 +870,10 @@ TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
 
   EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // We should get the data even though read has been shutdown.
   EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2));
 
@@ -1112,6 +1127,10 @@ TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
   ASSERT_THAT(send(s_, str, sizeof(str), 0),
               SyscallSucceedsWithValue(sizeof(str)));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   n = -1;
   EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
   EXPECT_EQ(n, sizeof(str));
@@ -1123,6 +1142,8 @@ TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
   EXPECT_EQ(n, sizeof(str));
 }
 
+// NOTE: Do not use `FIONREAD` as test name because it will be replaced by the
+// corresponding macro and become `0x541B`.
 TEST_P(UdpSocketTest, Fionread) {
   // Bind s_ to loopback:TestPort.
   ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
@@ -1138,10 +1159,14 @@ TEST_P(UdpSocketTest, Fionread) {
   char buf[3 * psize];
   RandomizeBuffer(buf, sizeof(buf));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
   for (int i = 0; i < 3; ++i) {
     ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
                 SyscallSucceedsWithValue(psize));
 
+    ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+                SyscallSucceedsWithValue(1));
+
     // Check that regardless of how many packets are in the queue, the size
     // reported is that of a single packet.
     n = -1;
@@ -1165,10 +1190,18 @@ TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
   char buf[3 * psize];
   RandomizeBuffer(buf, sizeof(buf));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
   for (int i = 0; i < 3; ++i) {
     ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_),
                 SyscallSucceedsWithValue(0));
 
+    // TODO(gvisor.dev/issue/2726): sending a zero-length message to a hostinet
+    // socket does not cause a poll event to be triggered.
+    if (!IsRunningWithHostinet()) {
+      ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+                  SyscallSucceedsWithValue(1));
+    }
+
     // Check that regardless of how many packets are in the queue, the size
     // reported is that of a single packet.
     n = -1;
@@ -1235,6 +1268,10 @@ TEST_P(UdpSocketTest, SoTimestamp) {
   // Send zero length packet from t_ to s_.
   ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
   msghdr msg;
   memset(&msg, 0, sizeof(msg));
@@ -1278,6 +1315,10 @@ TEST_P(UdpSocketTest, TimestampIoctl) {
   ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
               SyscallSucceedsWithValue(sizeof(buf)));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // There should be no control messages.
   char recv_buf[sizeof(buf)];
   ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
@@ -1315,6 +1356,10 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
               SyscallSucceedsWithValue(sizeof(buf)));
   ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
 
+  struct pollfd pfd = {s_, POLLIN, 0};
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // There should be no control messages.
   char recv_buf[sizeof(buf)];
   ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
@@ -1330,6 +1375,9 @@ TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
               SyscallSucceeds());
   ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
 
+  ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+              SyscallSucceedsWithValue(1));
+
   // There should be a message for SO_TIMESTAMP.
   char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
   msghdr msg = {};
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 95e1e0c96..b20758626 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -42,12 +42,13 @@ namespace testing {
 
 #define TEST_ON_GVISOR "TEST_ON_GVISOR"
 #define GVISOR_NETWORK "GVISOR_NETWORK"
+#define GVISOR_VFS "GVISOR_VFS"
 
 bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; }
 
 const std::string GvisorPlatform() {
   // Set by runner.go.
-  char* env = getenv(TEST_ON_GVISOR);
+  const char* env = getenv(TEST_ON_GVISOR);
   if (!env) {
     return Platform::kNative;
   }
@@ -55,10 +56,19 @@ const std::string GvisorPlatform() {
 }
 
 bool IsRunningWithHostinet() {
-  char* env = getenv(GVISOR_NETWORK);
+  const char* env = getenv(GVISOR_NETWORK);
   return env && strcmp(env, "host") == 0;
 }
 
+bool IsRunningWithVFS1() {
+  const char* env = getenv(GVISOR_VFS);
+  if (env == nullptr) {
+    // If not set, it's running on Linux.
+    return false;
+  }
+  return strcmp(env, "VFS1") == 0;
+}
+
 // Inline cpuid instruction.  Preserve %ebx/%rbx register. In PIC compilations
 // %ebx contains the address of the global offset table. %rbx is occasionally
 // used to address stack variables in presence of dynamic allocas.
diff --git a/test/util/test_util.h b/test/util/test_util.h
index c5cb9d6d6..8e3245b27 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -220,6 +220,7 @@ constexpr char kKVM[] = "kvm";
 bool IsRunningOnGvisor();
 const std::string GvisorPlatform();
 bool IsRunningWithHostinet();
+bool IsRunningWithVFS1();
 
 #ifdef __linux__
 void SetupGvisorDeathTest();
diff --git a/tools/bazel.mk b/tools/bazel.mk
index 7cb6e393b..9f4a40669 100644
--- a/tools/bazel.mk
+++ b/tools/bazel.mk
@@ -21,7 +21,8 @@ BRANCH_NAME := $(shell (git branch --show-current 2>/dev/null || \
 
 # Bazel container configuration (see below).
 USER ?= gvisor
-DOCKER_NAME ?= gvisor-bazel-$(shell readlink -m $(CURDIR) | md5sum | cut -c1-8)
+HASH ?= $(shell readlink -m $(CURDIR) | md5sum | cut -c1-8)
+DOCKER_NAME ?= gvisor-bazel-$(HASH)
 DOCKER_PRIVILEGED ?= --privileged
 BAZEL_CACHE := $(shell readlink -m ~/.cache/bazel/)
 GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/)
@@ -40,6 +41,7 @@ FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)"
 DOCKER_GROUP := $(shell stat -c '%g' $(DOCKER_SOCKET))
 ifneq ($(GID),$(DOCKER_GROUP))
 USERADD_OPTIONS += --groups $(DOCKER_GROUP)
+GROUPADD_DOCKER += groupadd --gid $(DOCKER_GROUP) --non-unique docker-$(HASH) &&
 FULL_DOCKER_RUN_OPTIONS += --group-add $(DOCKER_GROUP)
 endif
 endif
@@ -71,10 +73,12 @@ bazel-server-start: load-default ## Starts the bazel server.
 		$(FULL_DOCKER_RUN_OPTIONS) \
 		gvisor.dev/images/default \
 		sh -c "groupadd --gid $(GID) --non-unique $(USER) && \
+		       $(GROUPADD_DOCKER) \
 		       useradd --uid $(UID) --non-unique --no-create-home --gid $(GID) $(USERADD_OPTIONS) -d $(HOME) $(USER) && \
 	               bazel version && \
 		       exec tail --pid=\$$(bazel info server_pid) -f /dev/null"
-	@while :; do if docker logs $(DOCKER_NAME) 2>/dev/null | grep "Build label:" >/dev/null; then break; fi; sleep 1; done
+	@while :; do if docker logs $(DOCKER_NAME) 2>/dev/null | grep "Build label:" >/dev/null; then break; fi; \
+		if ! docker ps | grep $(DOCKER_NAME); then exit 1; else sleep 1; fi; done
 .PHONY: bazel-server-start
 
 bazel-shutdown: ## Shuts down a running bazel server.
@@ -89,14 +93,16 @@ bazel-server: ## Ensures that the server exists. Used as an internal target.
 	@docker exec $(DOCKER_NAME) true || $(MAKE) bazel-server-start
 .PHONY: bazel-server
 
-build_paths = docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) sh -o pipefail -c 'bazel build $(OPTIONS) $(TARGETS) 2>&1 \
-		| tee /dev/fd/2 \
+build_cmd = docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) sh -o pipefail -c 'bazel $(STARTUP_OPTIONS) build $(OPTIONS) $(TARGETS)'
+
+build_paths = $(build_cmd) 2>&1 \
+		| tee /proc/self/fd/2 \
 		| grep -E "^  bazel-bin/" \
-		| awk "{print $$1;}"' \
+		| awk "{print $$1;}" \
 		| xargs -n 1 -I {} sh -c "$(1)"
 
 build: bazel-server
-	@$(call build_paths,echo {})
+	@$(call build_cmd)
 .PHONY: build
 
 copy: bazel-server
@@ -114,5 +120,5 @@ sudo: bazel-server
 .PHONY: sudo
 
 test: bazel-server
-	@docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) bazel test $(OPTIONS) $(TARGETS)
+	@docker exec --user $(UID):$(GID) -i $(DOCKER_NAME) bazel $(STARTUP_OPTIONS) test $(OPTIONS) $(TARGETS)
 .PHONY: test
diff --git a/tools/defs.bzl b/tools/defs.bzl
index 41eded16d..0bd468786 100644
--- a/tools/defs.bzl
+++ b/tools/defs.bzl
@@ -96,6 +96,7 @@ def go_imports(name, src, out):
         cmd = ("$(location @org_golang_x_tools//cmd/goimports:goimports) $(SRCS) > $@"),
     )
 
+# TODO(b/158696872): Enable nogo by default.
 def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, marshal_debug = False, nogo = False, **kwargs):
     """Wraps the standard go_library and does stateification and marshalling.
 
diff --git a/tools/go_branch.sh b/tools/go_branch.sh
index f97a74aaf..093de89b4 100755
--- a/tools/go_branch.sh
+++ b/tools/go_branch.sh
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -eo pipefail
+set -xeo pipefail
 
 # Discovery the package name from the go.mod file.
 declare -r module=$(cat go.mod | grep -E "^module" | cut -d' ' -f2)
@@ -42,7 +42,8 @@ declare -r head=$(git describe --always)
 
 # We expect to have an existing go branch that we will use as the basis for
 # this commit. That branch may be empty, but it must exist.
-declare -r go_branch=$(git show-ref --hash origin/go)
+git fetch --all
+declare -r go_branch=$(git show-ref --hash go)
 
 # Clone the current repository to the temporary directory, and check out the
 # current go_branch directory. We move to the new repository for convenience.
@@ -87,6 +88,12 @@ EOF
 # because they may correspond to unused templates, etc.
 cp "${repo_orig}"/runsc/*.go runsc/
 
+# Normalize all permissions. The way bazel constructs the :gopath tree may leave
+# some strange permissions on files. We don't have anything in this tree that
+# should be execution, only the Go source files, README.md, and ${othersrc}.
+find . -type f -exec chmod 0644 {} \;
+find . -type d -exec chmod 0755 {} \;
+
 # Update the current working set and commit.
 git add . && git commit -m "Merge ${head} (automated)"
 
diff --git a/tools/go_generics/generics.go b/tools/go_generics/generics.go
index e9cc2c753..0860ca9db 100644
--- a/tools/go_generics/generics.go
+++ b/tools/go_generics/generics.go
@@ -223,7 +223,9 @@ func main() {
 		} else {
 			switch kind {
 			case globals.KindType, globals.KindVar, globals.KindConst, globals.KindFunction:
-				ident.Name = *prefix + ident.Name + *suffix
+				if ident.Name != "_" {
+					ident.Name = *prefix + ident.Name + *suffix
+				}
 			case globals.KindTag:
 				// Modify the state tag appropriately.
 				if m := stateTagRegexp.FindStringSubmatch(ident.Name); m != nil {
diff --git a/tools/go_generics/globals/scope.go b/tools/go_generics/globals/scope.go
index 96c965ea2..eec93534b 100644
--- a/tools/go_generics/globals/scope.go
+++ b/tools/go_generics/globals/scope.go
@@ -72,6 +72,10 @@ func (s *scope) deepLookup(n string) *symbol {
 }
 
 func (s *scope) add(name string, kind SymKind, pos token.Pos) {
+	if s.syms[name] != nil {
+		return
+	}
+
 	s.syms[name] = &symbol{
 		kind:  kind,
 		pos:   pos,
diff --git a/tools/make_repository.sh b/tools/make_apt.sh
index 32d7b3b1f..3fb1066e5 100755
--- a/tools/make_repository.sh
+++ b/tools/make_apt.sh
@@ -14,22 +14,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# We need to be sure that only a repo path is printed on stdout.
-exec 50<&1
-exec 1<&2
-
-echo_stdout() {
-  echo "$@" >&50
-}
-
-# Parse arguments. We require more than two arguments, which are the private
-# keyring, the e-mail associated with the signer, and the list of packages.
-if [ "$#" -le 3 ]; then
-  echo "usage: $0 <private-key> <signer-email> <root> <packages...>"
+if [[ "$#" -le 3 ]]; then
+  echo "usage: $0 <private-key> <suite> <root> <packages...>"
   exit 1
 fi
 declare -r private_key=$(readlink -e "$1"); shift
-declare -r signer="$1"; shift
+declare -r suite="$1"; shift
 declare -r root="$1"; shift
 
 # Ensure that we have the correct packages installed.
@@ -52,16 +42,16 @@ function apt_install() {
     esac
   done
 }
-dpkg-sig --help >/dev/null       || apt_install dpkg-sig
-apt-ftparchive --help >/dev/null || apt_install apt-utils
-xz --help >/dev/null             || apt_install xz-utils
+dpkg-sig --help >/dev/null 2>&1       || apt_install dpkg-sig
+apt-ftparchive --help >/dev/null 2>&1 || apt_install apt-utils
+xz --help >/dev/null 2>&1             || apt_install xz-utils
 
 # Verbose from this point.
 set -xeo pipefail
 
-# Create a temporary working directory. We don't remove this, as we ultimately
-# print this result and allow the caller to copy wherever they would like.
-declare -r tmpdir=$(mktemp -d /tmp/repoXXXXXX)
+# Create a directory for the release.
+declare -r release="${root}/dists/${suite}"
+mkdir -p "${release}"
 
 # Create a temporary keyring, and ensure it is cleaned up.
 declare -r keyring=$(mktemp /tmp/keyringXXXXXX.gpg)
@@ -69,12 +59,18 @@ cleanup() {
   rm -f "${keyring}"
 }
 trap cleanup EXIT
-gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}"
+
+# We attempt the import twice because the first one will fail if the public key
+# is not found. This isn't actually a failure for us, because we don't require
+# the public (this may be stored separately). The second import will succeed
+# because, in reality, the first import succeeded and it's a no-op.
+gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}" || \
+  gpg --no-default-keyring --keyring "${keyring}" --import "${private_key}"
 
 # Copy the packages into the root.
 for pkg in "$@"; do
-  name=$(basename "${pkg}" .deb)
-  name=$(basename "${name}" .changes)
+  ext=${pkg##*.}
+  name=$(basename "${pkg}" ".${ext}")
   arch=${name##*_}
   if [[ "${name}" == "${arch}" ]]; then
     continue # Not a regular package.
@@ -90,17 +86,22 @@ for pkg in "$@"; do
     echo "Unknown file type: ${pkg}"
     exit 1
   fi
-  version=${version// /} # Trim whitespace.
-  mkdir -p "${root}"/pool/"${version}"/binary-"${arch}"
-  cp -a "${pkg}" "${root}"/pool/"${version}"/binary-"${arch}"
-done
 
-# Ensure all permissions are correct.
-find "${root}"/pool -type f -exec chmod 0644 {} \;
+  # The package may already exist, in which case we leave it alone.
+  version=${version// /} # Trim whitespace.
+  destdir="${root}/pool/${version}/binary-${arch}"
+  target="${destdir}/${name}.${ext}"
+  if [[ -f "${target}" ]]; then
+    continue
+  fi
 
-# Sign all packages.
-for file in "${root}"/pool/*/binary-*/*.deb; do
-  dpkg-sig -g "--no-default-keyring --keyring ${keyring}" --sign builder "${file}"
+  # Copy & sign the package.
+  mkdir -p "${destdir}"
+  cp -a "${pkg}" "${target}"
+  chmod 0644 "${target}"
+  if [[ "${ext}" == "deb" ]]; then
+    dpkg-sig -g "--no-default-keyring --keyring ${keyring}" --sign builder "${target}"
+  fi
 done
 
 # Build the package list.
@@ -109,7 +110,7 @@ for dir in "${root}"/pool/*/binary-*; do
   name=$(basename "${dir}")
   arch=${name##binary-}
   arches+=("${arch}")
-  repo_packages="${tmpdir}"/main/"${name}"
+  repo_packages="${release}"/main/"${name}"
   mkdir -p "${repo_packages}"
   (cd "${root}" && apt-ftparchive --arch "${arch}" packages pool > "${repo_packages}"/Packages)
   (cd "${repo_packages}" && cat Packages | gzip > Packages.gz)
@@ -117,23 +118,22 @@ for dir in "${root}"/pool/*/binary-*; do
 done
 
 # Build the release list.
-cat > "${tmpdir}"/apt.conf <<EOF
+cat > "${release}"/apt.conf <<EOF
 APT {
   FTPArchive {
     Release {
       Architectures "${arches[@]}";
+      Suite "${suite}";
       Components "main";
     };
   };
 };
 EOF
-(cd "${tmpdir}" && apt-ftparchive -c=apt.conf release . > Release)
-rm "${tmpdir}"/apt.conf
+(cd "${release}" && apt-ftparchive -c=apt.conf release . > Release)
+rm "${release}"/apt.conf
 
 # Sign the release.
 declare -r digest_opts=("--digest-algo" "SHA512" "--cert-digest-algo" "SHA512")
-(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign "${digest_opts[@]}" -o InRelease Release)
-(cd "${tmpdir}" && gpg --no-default-keyring --keyring "${keyring}" -abs "${digest_opts[@]}" -o Release.gpg Release)
-
-# Show the results.
-echo_stdout "${tmpdir}"
+(cd "${release}" && rm -f Release.gpg InRelease)
+(cd "${release}" && gpg --no-default-keyring --keyring "${keyring}" --clearsign "${digest_opts[@]}" -o InRelease Release)
+(cd "${release}" && gpg --no-default-keyring --keyring "${keyring}" -abs "${digest_opts[@]}" -o Release.gpg Release)
diff --git a/tools/make_release.sh b/tools/make_release.sh
new file mode 100755
index 000000000..b1cdd47b0
--- /dev/null
+++ b/tools/make_release.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [[ "$#" -le 2 ]]; then
+  echo "usage: $0 <private-key> <root> <binaries & packages...>"
+  echo "The environment variable NIGHTLY may be set to control"
+  echo "whether the nightly packages are produced or not."
+  exit 1
+fi
+
+set -xeo pipefail
+declare -r private_key="$1"; shift
+declare -r root="$1"; shift
+declare -a binaries
+declare -a pkgs
+
+# Collect binaries & packages.
+for arg in "$@"; do
+  if [[ "${arg}" == *.deb ]] || [[ "${arg}" == *.changes ]]; then
+    pkgs+=("${arg}")
+  else
+    binaries+=("${arg}")
+  fi
+done
+
+# install_raw installs raw artifacts.
+install_raw() {
+  mkdir -p "${root}/$1"
+  for binary in "${binaries[@]}"; do
+    # Copy the raw file & generate a sha512sum.
+    name=$(basename "${binary}")
+    cp -f "${binary}" "${root}/$1"
+    sha512sum "${root}/$1/${name}" | \
+        awk "{print $$1 \"  ${name}\"}" > "${root}/$1/${name}.sha512"
+  done
+}
+
+# install_apt installs an apt repository.
+install_apt() {
+  tools/make_apt.sh "${private_key}" "$1" "${root}" "${pkgs[@]}"
+}
+
+# If nightly, install only nightly artifacts.
+if [[ "${NIGHTLY:-false}" == "true" ]]; then
+  # The "latest" directory and current date.
+  stamp="$(date -Idate)"
+  install_raw "nightly/latest"
+  install_raw "nightly/${stamp}"
+  install_apt "nightly"
+else
+  # Is it a tagged release? Build that.
+  tags="$(git tag --points-at HEAD 2>/dev/null || true)"
+  if ! [[ -z "${tags}" ]]; then
+    # Note that a given commit can match any number of tags. We have to iterate
+    # through all possible tags and produce associated artifacts.
+    for tag in ${tags}; do
+      name=$(echo "${tag}" | cut -d'-' -f2)
+      base=$(echo "${name}" | cut -d'.' -f1)
+      install_raw "release/${name}"
+      install_raw "release/latest"
+      install_apt "release"
+      install_apt "${base}"
+    done
+  else
+    # Otherwise, assume it is a raw master commit.
+    install_raw "master/latest"
+    install_apt "master"
+  fi
+fi
diff --git a/tools/tag_release.sh b/tools/tag_release.sh
index 4dbfe420a..b0bab74b4 100755
--- a/tools/tag_release.sh
+++ b/tools/tag_release.sh
@@ -18,10 +18,10 @@
 # validate a provided release name, create a tag and push it. It must be
 # run manually when a release is created.
 
-set -xeu
+set -xeuo pipefail
 
 # Check arguments.
-if [ "$#" -ne 3 ]; then
+if [[ "$#" -ne 3 ]]; then
   echo "usage: $0 <commit|revid> <release.rc> <message-file>"
   exit 1
 fi
@@ -30,6 +30,12 @@ declare -r target_commit="$1"
 declare -r release="$2"
 declare -r message_file="$3"
 
+if [[ -z "${target_commit}" ]]; then
+  echo "error: <commit|revid> is empty."
+fi
+if [[ -z "${release}" ]]; then
+  echo "error: <release.rc> is empty."
+fi
 if ! [[ -r "${message_file}" ]]; then
   echo "error: message file '${message_file}' is not readable."
   exit 1
@@ -68,8 +74,9 @@ if ! [[ "${release}" =~ ^20[0-9]{6}\.[0-9]+$ ]]; then
   exit 1
 fi
 
-# Tag the given commit (annotated, to record the committer).
+# Tag the given commit (annotated, to record the committer). Note that the tag
+# here is applied as a force, in case the tag already exists and is the same.
+# The push will fail in this case (because it is not forced).
 declare -r tag="release-${release}"
-(git tag -F "${message_file}" -a "${tag}" "${commit}" && \
-  git push origin tag "${tag}") || \
-  (git tag -d "${tag}" && false)
+git tag -f -F "${message_file}" -a "${tag}" "${commit}" && \
+  git push origin tag "${tag}"
diff --git a/tools/vm/build.sh b/tools/vm/build.sh
index 098258cad..752b2b77b 100755
--- a/tools/vm/build.sh
+++ b/tools/vm/build.sh
@@ -71,7 +71,7 @@ declare internal=""
 declare -r start=$(date +%s)
 declare -r end=$((${start}+${timeout}))
 while [[ "$(date +%s)" -lt "${end}" ]] && [[ "${success}" -lt 3 ]]; do
-  echo -n "."
+  echo -n "." >&2
   if gcloud compute ssh --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then
     success=$((${success}+1))
   elif gcloud compute ssh --internal-ip --zone "${ZONE}" "${USERNAME}"@"${INSTANCE_NAME}" -- true 2>/dev/null; then
diff --git a/tools/vm/defs.bzl b/tools/vm/defs.bzl
index 61feefcbc..0f67cfa92 100644
--- a/tools/vm/defs.bzl
+++ b/tools/vm/defs.bzl
@@ -60,7 +60,7 @@ def _vm_image_impl(ctx):
     # Run the builder to generate our output.
     echo = ctx.actions.declare_file(ctx.label.name)
     resolved_inputs, argv, runfiles_manifests = ctx.resolve_command(
-        command = "echo -ne \"#!/bin/bash\\necho $(%s)\\n\" > %s && chmod 0755 %s" % (
+        command = "echo -ne \"#!/bin/bash\\nset -e\\nimage=$(%s)\\necho ${image}\\n\" > %s && chmod 0755 %s" % (
             ctx.files.builder[0].path,
             echo.path,
             echo.path,
diff --git a/tools/vm/ubuntu1604/40_kokoro.sh b/tools/vm/ubuntu1604/40_kokoro.sh
index 06a1e6c48..2974f156c 100755
--- a/tools/vm/ubuntu1604/40_kokoro.sh
+++ b/tools/vm/ubuntu1604/40_kokoro.sh
@@ -43,14 +43,14 @@ done
 # junitparser is used to merge junit xml files.
 pip install junitparser
 
-# We need a kbuilder user.
-if useradd -c "kbuilder user" -m -s /bin/bash kbuilder; then
-    # User was added successfully; we add the relevant SSH keys here.
-    mkdir -p ~kbuilder/.ssh
-    (IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys
-    chmod 0600 ~kbuilder/.ssh/authorized_keys
-    chown -R kbuilder ~kbuilder/.ssh
-fi
+# We need a kbuilder user, which may already exist.
+useradd -c "kbuilder user" -m -s /bin/bash kbuilder || true
+
+# We need to provision appropriate keys.
+mkdir -p ~kbuilder/.ssh
+(IFS=$'\n'; echo "${ssh_public_keys[*]}") > ~kbuilder/.ssh/authorized_keys
+chmod 0600 ~kbuilder/.ssh/authorized_keys
+chown -R kbuilder ~kbuilder/.ssh
 
 # Give passwordless sudo access.
 cat > /etc/sudoers.d/kokoro <<EOF
diff --git a/website/BUILD b/website/BUILD
index d6afd5f44..c97b2560b 100644
--- a/website/BUILD
+++ b/website/BUILD
@@ -138,7 +138,6 @@ docs(
         "//g3doc:community",
         "//g3doc:index",
         "//g3doc:roadmap",
-        "//g3doc/architecture_guide:index",
         "//g3doc/architecture_guide:performance",
         "//g3doc/architecture_guide:platforms",
         "//g3doc/architecture_guide:resources",
diff --git a/website/_config.yml b/website/_config.yml
index 3241e458c..b08602970 100644
--- a/website/_config.yml
+++ b/website/_config.yml
@@ -12,6 +12,7 @@ plugins:
   - jekyll-inline-svg
   - jekyll-relative-links
   - jekyll-feed
+  - jekyll-sitemap
 site_url: https://gvisor.dev
 feed:
   path: blog/index.xml
diff --git a/website/_includes/footer.html b/website/_includes/footer.html
index 5d9267f35..9cc8176f7 100644
--- a/website/_includes/footer.html
+++ b/website/_includes/footer.html
@@ -2,9 +2,9 @@
   {% include footer-links.html %}
 </footer>
 
+<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.10.1/js/all.min.js" integrity="sha256-Z1Nvg/+y2+vRFhFgFij7Lv0r77yG3hOvWz2wI0SfTa0=" crossorigin="anonymous"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha256-U5ZEeKfGNOja007MMD3YBI0A3OSZOQbeG6z2f2Y0hu8=" crossorigin="anonymous"></script>
-<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/d3/4.13.0/d3.min.js" integrity="sha256-hYXbQJK4qdJiAeDVjjQ9G0D6A0xLnDQ4eJI9dkm7Fpk=" crossorigin="anonymous"></script>
 
 {% if site.analytics %}
diff --git a/website/_layouts/docs.html b/website/_layouts/docs.html
index e11492915..549305089 100644
--- a/website/_layouts/docs.html
+++ b/website/_layouts/docs.html
@@ -47,11 +47,13 @@ categories:
       <h1>{{ page.title }}</h1>
       {% if page.editpath %}
         <p>
-        <a href="https://github.com/google/gvisor/edit/master/content/{{page.editpath}}" target="_blank"><i class="fa fa-edit fa-fw"></i> Edit this page</a>
+        <a href="https://github.com/google/gvisor/edit/master/{{page.editpath}}" target="_blank"><i class="fa fa-edit fa-fw"></i> Edit this page</a>
         <a href="https://github.com/google/gvisor/issues/new?title={{page.title | url_encode}}" target="_blank"><i class="fab fa-github fa-fw"></i> Create issue</a>
         </p>
       {% endif %}
+      <div class="docs-content">
       {{ content }}
+      </div>
     </div>
   </div>
 </div>
diff --git a/website/_sass/front.scss b/website/_sass/front.scss
index 44a7e3473..0e4208f3c 100644
--- a/website/_sass/front.scss
+++ b/website/_sass/front.scss
@@ -4,12 +4,14 @@
   background-repeat: no-repeat;
   background-size: cover;
   background-blend-mode: darken;
-  background-color: rgba(0, 0, 0, 0.1);
+  background-color: rgba(0, 0, 0, 0.3);
 
   p {
     color: #fff;
     margin-top: 0;
     margin-bottom: 0;
     font-weight: 300;
+    font-size: 24px;
+    line-height: 30px;
   }
 }
diff --git a/website/_sass/style.scss b/website/_sass/style.scss
index 520ea469a..4deb945d4 100644
--- a/website/_sass/style.scss
+++ b/website/_sass/style.scss
@@ -142,3 +142,13 @@ table th {
   margin-top: 10px;
   margin-bottom: 20px;
 }
+
+.docs-content * img {
+  display: block;
+  margin: 20px auto;
+}
+
+.blog-content * img {
+  display: block;
+  margin: 20px auto;
+}
diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md
index ed6d97ffe..fbdd511dd 100644
--- a/website/blog/2019-11-18-security-basics.md
+++ b/website/blog/2019-11-18-security-basics.md
@@ -56,15 +56,9 @@ in combination: redundant walls, scattered draw bridges, small bottle-neck
 entrances, moats, etc.
 
 A simplified version of the design is below
-([more detailed version](/docs/architecture_guide/))[^2]:
+([more detailed version](/docs/))[^2]:
 
---------------------------------------------------------------------------------
-
-![Figure 1](/assets/images/2019-11-18-security-basics-figure1.png)
-
-Figure 1: Simplified design of gVisor.
-
---------------------------------------------------------------------------------
+![Figure 1](/assets/images/2019-11-18-security-basics-figure1.png "Simplified design of gVisor.")
 
 In order to discuss design principles, the following components are important to
 know:
@@ -134,13 +128,7 @@ minimum level of permission is required for it to perform its function.
 Specifically, the closer you are to the untrusted application, the less
 privilege you have.
 
---------------------------------------------------------------------------------
-
-![Figure 2](/assets/images/2019-11-18-security-basics-figure2.png)
-
-Figure 2: runsc components and their privileges.
-
---------------------------------------------------------------------------------
+![Figure 2](/assets/images/2019-11-18-security-basics-figure2.png "runsc components and their privileges.")
 
 This is evident in how runsc (the drop in gVisor binary for Docker/Kubernetes)
 constructs the sandbox. The Sentry has the least privilege possible (it can't
@@ -222,15 +210,7 @@ the host Linux syscalls. In other words, with gVisor, applications get the vast
 majority (and growing) functionality of Linux containers for only 68 possible
 syscalls to the Host OS. 350 syscalls to 68 is attack surface reduction.
 
---------------------------------------------------------------------------------
-
-![Figure 3](/assets/images/2019-11-18-security-basics-figure3.png)
-
-Figure 3: Reduction of Attack Surface of the Syscall Table. Note that the
-Senty's Syscall Emulation Layer keeps the Containerized Process from ever
-calling the Host OS.
-
---------------------------------------------------------------------------------
+![Figure 3](/assets/images/2019-11-18-security-basics-figure3.png "Reduction of Attack Surface of the Syscall Table. Note that the Senty's Syscall Emulation Layer keeps the Containerized Process from ever calling the Host OS.")
 
 ## Secure-by-default
 
diff --git a/website/blog/2020-04-02-networking-security.md b/website/blog/2020-04-02-networking-security.md
index 78f0a6714..5a5e38fd7 100644
--- a/website/blog/2020-04-02-networking-security.md
+++ b/website/blog/2020-04-02-networking-security.md
@@ -69,13 +69,7 @@ a similar syscall). Moreover, because packets typically come from off-host (e.g.
 the internet), the Host OS's packet processing code has received a lot of
 scrutiny, hopefully resulting in a high degree of hardening.
 
---------------------------------------------------------------------------------
-
-![Figure 1](/assets/images/2020-04-02-networking-security-figure1.png)
-
-Figure 1: Netstack and gVisor
-
---------------------------------------------------------------------------------
+![Figure 1](/assets/images/2020-04-02-networking-security-figure1.png "Network and gVisor.")
 
 ## Writing a network stack
 
diff --git a/website/cmd/syscalldocs/main.go b/website/cmd/syscalldocs/main.go
index 62d293a05..327537214 100644
--- a/website/cmd/syscalldocs/main.go
+++ b/website/cmd/syscalldocs/main.go
@@ -46,7 +46,7 @@ type SyscallDoc struct {
 }
 
 var mdTemplate = template.Must(template.New("out").Parse(`---
-title: {{.OS}}/{{.Arch}}
+title: {{.Title}}
 description: Syscall Compatibility Reference Documentation for {{.OS}}/{{.Arch}}
 layout: docs
 category: Compatibility
@@ -134,6 +134,7 @@ func main() {
 
 			weight += 10
 			data := struct {
+				Title        string
 				OS           string
 				Arch         string
 				Weight       int
@@ -149,7 +150,8 @@ func main() {
 					URLs    []string
 				}
 			}{
-				OS:           strings.Title(osName),
+				Title:        strings.Title(osName) + "/" + archName,
+				OS:           osName,
 				Arch:         archName,
 				Weight:       weight,
 				Total:        0,
diff --git a/website/index.md b/website/index.md
index 34d3ee23d..84f877d49 100644
--- a/website/index.md
+++ b/website/index.md
@@ -3,10 +3,10 @@
     <div class="row">
       <div class="col-md-3"></div>
       <div class="col-md-6">
-        <p>gVisor is an <b>application kernel</b> and <b>container runtime</b> providing defense-in-depth for containers <em>anywhere</em>.</p>
+        <p>gVisor is an <b>application kernel</b> for <b>containers</b> that provides efficient defense-in-depth anywhere.</p>
         <p style="margin-top: 20px;">
+          <a class="btn" href="/docs/user_guide/quick_start/docker/">Quick start&nbsp;<i class="fas fa-arrow-alt-circle-right ml-2"></i></a>
           <a class="btn" href="/docs/">Learn More&nbsp;<i class="fas fa-arrow-alt-circle-right ml-2"></i></a>
-          <a class="btn btn-inverse" href="https://github.com/google/gvisor">GitHub&nbsp;<i class="fab fa-github ml-2"></i></a>
         </p>
       </div>
       <div class="col-md-3"></div>
@@ -19,8 +19,8 @@
 <div class="row">
   <div class="col-md-4">
     <h4 id="seamless-security">Container-native Security <i class="fas fa-lock"></i></h4>
-    <p>By providing each container with its own userspace kernel, gVisor limits
-    the attack surface of the host. This protection does not limit
+    <p>By providing each container with its own application kernel, gVisor
+    limits the attack surface of the host. This protection does not limit
     functionality: gVisor runs unmodified binaries and integrates with container
     orchestration systems, such as Docker and Kubernetes, and supports features
     such as volumes and sidecars.</p>